author    | thegeorg <thegeorg@yandex-team.ru> | 2022-05-10 22:16:03 +0300
committer | thegeorg <thegeorg@yandex-team.ru> | 2022-05-10 22:16:03 +0300
commit    | 09c71d918d4d0b0ebf67e1ab41aa90ddf587a3f2 (patch)
tree      | dd44d2cb68e2845c2d4c367b66893f3e043a6e8e /contrib/restricted/aws/s2n/pq-crypto
parent    | 5eb4a8a2d487411924e1d1b27c454223dcf35005 (diff)
download  | ydb-09c71d918d4d0b0ebf67e1ab41aa90ddf587a3f2.tar.gz
Update contrib/restricted/aws/s2n to 1.3.12
ref:f8279d764b4c00974a63543a1364c91e2b81b7a6
Diffstat (limited to 'contrib/restricted/aws/s2n/pq-crypto')
136 files changed, 14877 insertions, 3147 deletions
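Aside from the new bike_r3 implementation, most of the bike_r1/bike_r2 churn below is a mechanical rename of s2n's error-handling macros: GUARD becomes POSIX_GUARD, ENSURE_POSIX becomes POSIX_ENSURE, and notnull_check becomes POSIX_ENSURE_REF. The sketch below illustrates the early-return guard pattern these calls rely on; the macro bodies are simplified stand-ins written for this note, not s2n's actual definitions (those live in utils/s2n_safety.h upstream).

```c
/* Illustrative only: simplified stand-ins for the s2n-style guard macros
 * used throughout this diff. The real macros carry extra bookkeeping
 * (s2n_errno, debug info) that is omitted here. */
#include <stdio.h>

#define SUCCESS 0
#define FAILURE (-1)

/* Propagate any negative return value from a callee to our caller. */
#define POSIX_GUARD(expr)     \
    do {                      \
        if ((expr) < 0) {     \
            return FAILURE;   \
        }                     \
    } while (0)

/* Fail (and log the symbolic error) when a condition does not hold. */
#define POSIX_ENSURE(cond, err_code)                   \
    do {                                               \
        if (!(cond)) {                                 \
            fprintf(stderr, "error: %s\n", #err_code); \
            return FAILURE;                            \
        }                                              \
    } while (0)

/* Reject NULL pointers before they are dereferenced. */
#define POSIX_ENSURE_REF(ptr) POSIX_ENSURE((ptr) != NULL, S2N_ERR_NULL)

static int step(int x)
{
    return (x >= 0) ? SUCCESS : FAILURE;
}

int do_work(const int *input)
{
    POSIX_ENSURE_REF(input);   /* NULL check, as in the KEM entry points */
    POSIX_GUARD(step(*input)); /* early return on any callee failure */
    return SUCCESS;
}

int main(void)
{
    int value = 1;
    printf("do_work returned %d\n", do_work(&value));
    return 0;
}
```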
diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c index 26c99bc80d..2f211010df 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/aes_ctr_prf.c @@ -27,7 +27,7 @@ init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size); memcpy(key.raw, seed->raw, sizeof(key.raw)); - GUARD(aes256_key_expansion(&s->ks_ptr, &key)); + POSIX_GUARD(aes256_key_expansion(&s->ks_ptr, &key)); // Initialize buffer and counter s->ctr.u.qw[0] = 0; @@ -59,7 +59,7 @@ perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s) BIKE_ERROR(E_AES_OVER_USED); } - GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); + POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); s->ctr.u.qw[0]++; s->rem_invokations--; @@ -91,11 +91,11 @@ aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN const uint32_t len // Copy full AES blocks while((len - idx) >= AES256_BLOCK_SIZE) { - GUARD(perform_aes(&a[idx], s)); + POSIX_GUARD(perform_aes(&a[idx], s)); idx += AES256_BLOCK_SIZE; } - GUARD(perform_aes(s->buffer.u.bytes, s)); + POSIX_GUARD(perform_aes(s->buffer.u.bytes, s)); // Copy the tail s->pos = len - idx; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c index 21b0b6f5a3..ba43098837 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/bike_r1_kem.c @@ -78,18 +78,18 @@ encrypt(OUT ct_t *ct, p_pk[1].val = pk->val[1]; DMSG(" Sampling m.\n"); - GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); + POSIX_GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); DMSG(" Calculating the ciphertext.\n"); - GUARD(gf2x_mod_mul((uint64_t *)&p_ct[0], (uint64_t *)&m, (uint64_t *)&p_pk[0])); - GUARD(gf2x_mod_mul((uint64_t *)&p_ct[1], (uint64_t *)&m, (uint64_t *)&p_pk[1])); + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_ct[0], (uint64_t *)&m, (uint64_t *)&p_pk[0])); + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_ct[1], (uint64_t *)&m, (uint64_t *)&p_pk[1])); DMSG(" Addding Error to the ciphertext.\n"); - GUARD( + POSIX_GUARD( gf2x_add(p_ct[0].val.raw, p_ct[0].val.raw, splitted_e->val[0].raw, R_SIZE)); - GUARD( + POSIX_GUARD( gf2x_add(p_ct[1].val.raw, p_ct[1].val.raw, splitted_e->val[1].raw, R_SIZE)); // Copy the data outside @@ -113,12 +113,12 @@ calc_pk(OUT pk_t *pk, IN const seed_t *g_seed, IN const pad_sk_t p_sk) // Intialized padding to zero DEFER_CLEANUP(padded_r_t g = {0}, padded_r_cleanup); - GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); + POSIX_GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); // Calculate (g0, g1) = (g*h1, g*h0) - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, (const uint64_t *)&p_sk[1])); - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, (const uint64_t *)&p_sk[0])); // Copy the data to the output parameters. 
@@ -156,7 +156,7 @@ get_ss(OUT ss_t *out, IN const e_t *e) int BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to this implementation types pk_t *l_pk = (pk_t *)pk; @@ -177,14 +177,14 @@ BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DMSG(" Calculating the secret key.\n"); // h0 and h1 use the same context - GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); - GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, sizeof(p_sk[0]), &h_prf_state)); // Copy data l_sk.bin[0] = p_sk[0].val; - GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, sizeof(p_sk[1]), &h_prf_state)); // Copy data @@ -192,7 +192,7 @@ BIKE1_L1_R1_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DMSG(" Calculating the public key.\n"); - GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); + POSIX_GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); memcpy(sk, &l_sk, sizeof(l_sk)); @@ -214,7 +214,7 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct, IN const unsigned char *pk) { DMSG(" Enter crypto_kem_enc.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to this implementation types const pk_t *l_pk = (const pk_t *)pk; @@ -231,11 +231,11 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct, // Random data generator // Using first seed - GUARD(init_aes_ctr_prf_state(&e_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + POSIX_GUARD(init_aes_ctr_prf_state(&e_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); DMSG(" Generating error.\n"); ALIGN(8) compressed_idx_t_t dummy; - GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), + POSIX_GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), &e_prf_state)); print("e: ", (uint64_t *)&e.val, sizeof(e) * 8); @@ -250,7 +250,7 @@ BIKE1_L1_R1_crypto_kem_enc(OUT unsigned char * ct, // Computing ct = enc(pk, e) // Using second seed DMSG(" Encrypting.\n"); - GUARD(encrypt(l_ct, l_pk, &seeds.seed[1], &splitted_e)); + POSIX_GUARD(encrypt(l_ct, l_pk, &seeds.seed[1], &splitted_e)); DMSG(" Generating shared secret.\n"); get_ss(l_ss, &e.val); @@ -269,7 +269,7 @@ BIKE1_L1_R1_crypto_kem_dec(OUT unsigned char * ss, IN const unsigned char *sk) { DMSG(" Enter crypto_kem_dec.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to this implementation types const ct_t *l_ct = (const ct_t *)ct; @@ -284,10 +284,10 @@ BIKE1_L1_R1_crypto_kem_dec(OUT unsigned char * ss, DEFER_CLEANUP(e_t merged_e = {0}, e_cleanup); DMSG(" Computing s.\n"); - GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); + POSIX_GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); DMSG(" Decoding.\n"); - GUARD(decode(&e, &syndrome, l_ct, &l_sk)); + POSIX_GUARD(decode(&e, &syndrome, l_ct, &l_sk)); // Check if the error weight equals T1 if(T1 != r_bits_vector_weight(&e.val[0]) + r_bits_vector_weight(&e.val[1])) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c 
b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c index 404c6377da..b455cd7e82 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/decode.c @@ -96,12 +96,12 @@ compute_syndrome(OUT syndrome_t *syndrome, IN const ct_t *ct, IN const sk_t *sk) pad_ct[1].val = ct->val[1]; // Compute s = c0*h0 + c1*h1: - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], (uint64_t *)&pad_sk[0])); - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], (uint64_t *)&pad_sk[1])); - GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); + POSIX_GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); memcpy((uint8_t *)syndrome->qw, pad_s[0].val.raw, R_SIZE); dup(syndrome); @@ -118,13 +118,13 @@ recompute_syndrome(OUT syndrome_t *syndrome, ct_t tmp_ct = *ct; // Adapt the ciphertext - GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, R_SIZE)); - GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, R_SIZE)); // Recompute the syndrome - GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); + POSIX_GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); return SUCCESS; } @@ -334,7 +334,7 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); #ifdef BGF_DECODER if(iter >= 1) { @@ -346,14 +346,14 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); DMSG(" Weight of e: %lu\n", r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); } if(r_bits_vector_weight((r_t *)s.qw) > 0) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c index 09e0af3fde..c80d3365cb 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/openssl_utils.c @@ -108,15 +108,15 @@ ossl_add(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); if(BN_GF2m_add(r, a, b) == 0) { BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } @@ -176,10 +176,10 @@ cyclic_product(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); - GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + 
POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c index 3686338fad..d08fa5eea7 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.c @@ -20,7 +20,7 @@ get_rand_mod_len(OUT uint32_t * rand_pos, do { // Generate 128bit of random numbers - GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); + POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); // Mask only relevant bits (*rand_pos) &= mask; @@ -56,7 +56,7 @@ sample_uniform_r_bits_with_fixed_prf_context(OUT r_t *r, IN const must_be_odd_t must_be_odd) { // Generate random data - GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); + POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); // Mask upper bits of the MSByte r->raw[R_SIZE - 1] &= MASK(R_BITS + 8 - (R_SIZE * 8)); @@ -104,7 +104,7 @@ generate_sparse_rep(OUT uint64_t * a, // Generate weight rand numbers do { - GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); + POSIX_GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); ctr += is_new(wlist, ctr); } while(ctr < weight); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h index 1ffd56f34a..4ec60683de 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r1/sampling.h @@ -53,9 +53,9 @@ sample_uniform_r_bits(OUT r_t *r, // For the seedexpander DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); - GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); - GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c index 26c99bc80d..2f211010df 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/aes_ctr_prf.c @@ -27,7 +27,7 @@ init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size); memcpy(key.raw, seed->raw, sizeof(key.raw)); - GUARD(aes256_key_expansion(&s->ks_ptr, &key)); + POSIX_GUARD(aes256_key_expansion(&s->ks_ptr, &key)); // Initialize buffer and counter s->ctr.u.qw[0] = 0; @@ -59,7 +59,7 @@ perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s) BIKE_ERROR(E_AES_OVER_USED); } - GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); + POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks_ptr)); s->ctr.u.qw[0]++; s->rem_invokations--; @@ -91,11 +91,11 @@ aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN const uint32_t len // Copy full AES blocks while((len - idx) >= AES256_BLOCK_SIZE) { - GUARD(perform_aes(&a[idx], s)); + POSIX_GUARD(perform_aes(&a[idx], s)); idx += AES256_BLOCK_SIZE; } - GUARD(perform_aes(s->buffer.u.bytes, s)); + POSIX_GUARD(perform_aes(s->buffer.u.bytes, s)); // Copy the tail s->pos = len - idx; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c 
b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c index 8f29f3add9..e7797848a0 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/bike_r2_kem.c @@ -61,12 +61,12 @@ calc_pk(OUT pk_t *pk, IN const seed_t *g_seed, IN const pad_sk_t p_sk) // Intialized padding to zero DEFER_CLEANUP(padded_r_t g = {0}, padded_r_cleanup); - GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); + POSIX_GUARD(sample_uniform_r_bits(&g.val, g_seed, MUST_BE_ODD)); // Calculate (g0, g1) = (g*h1, g*h0) - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[0], (const uint64_t *)&g, (const uint64_t *)&p_sk[1])); - GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&p_pk[1], (const uint64_t *)&g, (const uint64_t *)&p_sk[0])); // Copy the data to the output parameters. @@ -102,12 +102,12 @@ function_h(OUT split_e_t *splitted_e, IN const r_t *in0, IN const r_t *in1) // Use the seed to generate a sparse error vector e: DMSG(" Generating random error.\n"); - GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, &seed_for_hash)); + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, &seed_for_hash)); DEFER_CLEANUP(padded_e_t e, padded_e_cleanup); DEFER_CLEANUP(ALIGN(8) compressed_idx_t_t dummy, compressed_idx_t_cleanup); - GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), + POSIX_GUARD(generate_sparse_rep((uint64_t *)&e, dummy.val, T1, N_BITS, sizeof(e), &prf_state)); split_e(splitted_e, &e.val); @@ -120,7 +120,7 @@ encrypt(OUT ct_t *ct, OUT split_e_t *mf, IN const pk_t *pk, IN const seed_t *see DEFER_CLEANUP(padded_r_t m = {0}, padded_r_cleanup); DMSG(" Sampling m.\n"); - GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); + POSIX_GUARD(sample_uniform_r_bits(&m.val, seed, NO_RESTRICTION)); // Pad the public key pad_pk_t p_pk = {0}; @@ -135,20 +135,20 @@ encrypt(OUT ct_t *ct, OUT split_e_t *mf, IN const pk_t *pk, IN const seed_t *see DEFER_CLEANUP(dbl_pad_ct_t mf_int = {0}, dbl_pad_ct_cleanup); DMSG(" Computing m*f0 and m*f1.\n"); - GUARD( + POSIX_GUARD( gf2x_mod_mul((uint64_t *)&mf_int[0], (uint64_t *)&m, (uint64_t *)&p_pk[0])); - GUARD( + POSIX_GUARD( gf2x_mod_mul((uint64_t *)&mf_int[1], (uint64_t *)&m, (uint64_t *)&p_pk[1])); DEFER_CLEANUP(split_e_t splitted_e, split_e_cleanup); DMSG(" Computing the hash function e <- H(m*f0, m*f1).\n"); - GUARD(function_h(&splitted_e, &mf_int[0].val, &mf_int[1].val)); + POSIX_GUARD(function_h(&splitted_e, &mf_int[0].val, &mf_int[1].val)); DMSG(" Addding Error to the ciphertext.\n"); - GUARD(gf2x_add(p_ct[0].val.raw, mf_int[0].val.raw, splitted_e.val[0].raw, + POSIX_GUARD(gf2x_add(p_ct[0].val.raw, mf_int[0].val.raw, splitted_e.val[0].raw, R_SIZE)); - GUARD(gf2x_add(p_ct[1].val.raw, mf_int[1].val.raw, splitted_e.val[1].raw, + POSIX_GUARD(gf2x_add(p_ct[1].val.raw, mf_int[1].val.raw, splitted_e.val[1].raw, R_SIZE)); // Copy the data to the output parameters. 
@@ -174,11 +174,11 @@ reencrypt(OUT pad_ct_t ce, IN const ct_t *l_ct) { // Compute (c0 + e0') and (c1 + e1') - GUARD(gf2x_add(ce[0].val.raw, l_ct->val[0].raw, e->val[0].raw, R_SIZE)); - GUARD(gf2x_add(ce[1].val.raw, l_ct->val[1].raw, e->val[1].raw, R_SIZE)); + POSIX_GUARD(gf2x_add(ce[0].val.raw, l_ct->val[0].raw, e->val[0].raw, R_SIZE)); + POSIX_GUARD(gf2x_add(ce[1].val.raw, l_ct->val[1].raw, e->val[1].raw, R_SIZE)); // (e0'', e1'') <-- H(c0 + e0', c1 + e1') - GUARD(function_h(e2, &ce[0].val, &ce[1].val)); + POSIX_GUARD(function_h(e2, &ce[0].val, &ce[1].val)); return SUCCESS; } @@ -212,10 +212,10 @@ get_ss(OUT ss_t *out, IN const r_t *in0, IN const r_t *in1, IN const ct_t *ct) int BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); - notnull_check(sk); - notnull_check(pk); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(pk); // Convert to this implementation types pk_t *l_pk = (pk_t *)pk; @@ -232,27 +232,27 @@ BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DEFER_CLEANUP(pad_sk_t p_sk = {0}, pad_sk_cleanup); // Get the entropy seeds. - GUARD(get_seeds(&seeds)); + POSIX_GUARD(get_seeds(&seeds)); DMSG(" Enter crypto_kem_keypair.\n"); DMSG(" Calculating the secret key.\n"); // h0 and h1 use the same context - GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); // sigma0/1/2 use the same context. - GUARD(init_aes_ctr_prf_state(&s_prf_state, MAX_AES_INVOKATION, &seeds.seed[2])); + POSIX_GUARD(init_aes_ctr_prf_state(&s_prf_state, MAX_AES_INVOKATION, &seeds.seed[2])); - GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[0], l_sk.wlist[0].val, DV, R_BITS, sizeof(p_sk[0]), &h_prf_state)); // Sample the sigmas - GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma0, &s_prf_state, + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma0, &s_prf_state, NO_RESTRICTION)); - GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma1, &s_prf_state, + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(&l_sk.sigma1, &s_prf_state, NO_RESTRICTION)); - GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, + POSIX_GUARD(generate_sparse_rep((uint64_t *)&p_sk[1], l_sk.wlist[1].val, DV, R_BITS, sizeof(p_sk[1]), &h_prf_state)); // Copy data @@ -261,7 +261,7 @@ BIKE1_L1_R2_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) DMSG(" Calculating the public key.\n"); - GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); + POSIX_GUARD(calc_pk(l_pk, &seeds.seed[1], p_sk)); memcpy(sk, &l_sk, sizeof(l_sk)); @@ -286,29 +286,29 @@ BIKE1_L1_R2_crypto_kem_enc(OUT unsigned char * ct, IN const unsigned char *pk) { DMSG(" Enter crypto_kem_enc.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to the types that are used by this implementation const pk_t *l_pk = (const pk_t *)pk; ct_t * l_ct = (ct_t *)ct; ss_t * l_ss = (ss_t *)ss; - notnull_check(pk); - notnull_check(ct); - notnull_check(ss); + POSIX_ENSURE_REF(pk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); // For NIST DRBG_CTR DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); // Get the entropy seeds. 
- GUARD(get_seeds(&seeds)); + POSIX_GUARD(get_seeds(&seeds)); DMSG(" Encrypting.\n"); // In fact, seed[0] should be used. // Here, we stay consistent with BIKE's reference code // that chooses the seconde seed. DEFER_CLEANUP(split_e_t mf, split_e_cleanup); - GUARD(encrypt(l_ct, &mf, l_pk, &seeds.seed[1])); + POSIX_GUARD(encrypt(l_ct, &mf, l_pk, &seeds.seed[1])); DMSG(" Generating shared secret.\n"); get_ss(l_ss, &mf.val[0], &mf.val[1], l_ct); @@ -327,14 +327,14 @@ BIKE1_L1_R2_crypto_kem_dec(OUT unsigned char * ss, IN const unsigned char *sk) { DMSG(" Enter crypto_kem_dec.\n"); - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); // Convert to the types used by this implementation const ct_t *l_ct = (const ct_t *)ct; ss_t * l_ss = (ss_t *)ss; - notnull_check(sk); - notnull_check(ct); - notnull_check(ss); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); DEFER_CLEANUP(ALIGN(8) sk_t l_sk, sk_cleanup); memcpy(&l_sk, sk, sizeof(l_sk)); @@ -344,14 +344,14 @@ BIKE1_L1_R2_crypto_kem_dec(OUT unsigned char * ss, DEFER_CLEANUP(split_e_t e, split_e_cleanup); DMSG(" Computing s.\n"); - GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); + POSIX_GUARD(compute_syndrome(&syndrome, l_ct, &l_sk)); DMSG(" Decoding.\n"); uint32_t dec_ret = decode(&e, &syndrome, l_ct, &l_sk) != SUCCESS ? 0 : 1; DEFER_CLEANUP(split_e_t e2, split_e_cleanup); DEFER_CLEANUP(pad_ct_t ce, pad_ct_cleanup); - GUARD(reencrypt(ce, &e2, &e, l_ct)); + POSIX_GUARD(reencrypt(ce, &e2, &e, l_ct)); // Check if the decoding is successful. // Check if the error weight equals T1. diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c index 404c6377da..b455cd7e82 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/decode.c @@ -96,12 +96,12 @@ compute_syndrome(OUT syndrome_t *syndrome, IN const ct_t *ct, IN const sk_t *sk) pad_ct[1].val = ct->val[1]; // Compute s = c0*h0 + c1*h1: - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[0], (uint64_t *)&pad_ct[0], (uint64_t *)&pad_sk[0])); - GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], + POSIX_GUARD(gf2x_mod_mul((uint64_t *)&pad_s[1], (uint64_t *)&pad_ct[1], (uint64_t *)&pad_sk[1])); - GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); + POSIX_GUARD(gf2x_add(pad_s[0].val.raw, pad_s[0].val.raw, pad_s[1].val.raw, R_SIZE)); memcpy((uint8_t *)syndrome->qw, pad_s[0].val.raw, R_SIZE); dup(syndrome); @@ -118,13 +118,13 @@ recompute_syndrome(OUT syndrome_t *syndrome, ct_t tmp_ct = *ct; // Adapt the ciphertext - GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[0].raw, tmp_ct.val[0].raw, splitted_e->val[0].raw, R_SIZE)); - GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, + POSIX_GUARD(gf2x_add(tmp_ct.val[1].raw, tmp_ct.val[1].raw, splitted_e->val[1].raw, R_SIZE)); // Recompute the syndrome - GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); + POSIX_GUARD(compute_syndrome(syndrome, &tmp_ct, sk)); return SUCCESS; } @@ -334,7 +334,7 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); #ifdef BGF_DECODER if(iter >= 
1) { @@ -346,14 +346,14 @@ decode(OUT split_e_t *e, DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); DMSG(" Weight of e: %lu\n", r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1); - GUARD(recompute_syndrome(&s, ct, sk, e)); + POSIX_GUARD(recompute_syndrome(&s, ct, sk, e)); } if(r_bits_vector_weight((r_t *)s.qw) > 0) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c index 09e0af3fde..c80d3365cb 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/openssl_utils.c @@ -108,15 +108,15 @@ ossl_add(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); if(BN_GF2m_add(r, a, b) == 0) { BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } @@ -176,10 +176,10 @@ cyclic_product(OUT uint8_t res_bin[R_SIZE], BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); } - GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); - GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); - GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); - GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(a, a_bin, R_SIZE)); + POSIX_GUARD(ossl_bin2bn(b, b_bin, R_SIZE)); + POSIX_GUARD(ossl_cyclic_product(r, a, b, bn_ctx)); + POSIX_GUARD(ossl_bn2bin(res_bin, r, R_SIZE)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c index 3686338fad..d08fa5eea7 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.c @@ -20,7 +20,7 @@ get_rand_mod_len(OUT uint32_t * rand_pos, do { // Generate 128bit of random numbers - GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); + POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); // Mask only relevant bits (*rand_pos) &= mask; @@ -56,7 +56,7 @@ sample_uniform_r_bits_with_fixed_prf_context(OUT r_t *r, IN const must_be_odd_t must_be_odd) { // Generate random data - GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); + POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_SIZE)); // Mask upper bits of the MSByte r->raw[R_SIZE - 1] &= MASK(R_BITS + 8 - (R_SIZE * 8)); @@ -104,7 +104,7 @@ generate_sparse_rep(OUT uint64_t * a, // Generate weight rand numbers do { - GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); + POSIX_GUARD(get_rand_mod_len(&wlist[ctr], len, prf_state)); ctr += is_new(wlist, ctr); } while(ctr < weight); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h index 1ffd56f34a..4ec60683de 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r2/sampling.h @@ -53,9 +53,9 @@ sample_uniform_r_bits(OUT r_t *r, // For the seedexpander DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); - 
GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); - GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); return SUCCESS; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
\ No newline at end of file diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h new file mode 100644 index 0000000000..b8b04c3655 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes.h @@ -0,0 +1,62 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include <openssl/evp.h> + +#include "cleanup.h" + +#define MAX_AES_INVOKATION (MASK(32)) + +#define AES256_KEY_BYTES (32U) +#define AES256_KEY_BITS (AES256_KEY_BYTES * 8) +#define AES256_BLOCK_BYTES (16U) +#define AES256_ROUNDS (14U) + +typedef ALIGN(16) struct aes256_key_s { + uint8_t raw[AES256_KEY_BYTES]; +} aes256_key_t; + +CLEANUP_FUNC(aes256_key, aes256_key_t) + +// Using OpenSSL structures +typedef EVP_CIPHER_CTX *aes256_ks_t; + +_INLINE_ ret_t aes256_key_expansion(OUT aes256_ks_t *ks, + IN const aes256_key_t *key) +{ + *ks = EVP_CIPHER_CTX_new(); + if(*ks == NULL) { + BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); + } + if(0 == EVP_EncryptInit_ex(*ks, EVP_aes_256_ecb(), NULL, key->raw, NULL)) { + EVP_CIPHER_CTX_free(*ks); + BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); + } + + EVP_CIPHER_CTX_set_padding(*ks, 0); + + return SUCCESS; +} + +_INLINE_ ret_t aes256_enc(OUT uint8_t *ct, + IN const uint8_t *pt, + IN const aes256_ks_t *ks) +{ + int outlen = 0; + if(0 == EVP_EncryptUpdate(*ks, ct, &outlen, pt, AES256_BLOCK_BYTES)) { + BIKE_ERROR(EXTERNAL_LIB_ERROR_OPENSSL); + } + return SUCCESS; +} + +_INLINE_ void aes256_free_ks(OUT aes256_ks_t *ks) +{ + EVP_CIPHER_CTX_free(*ks); + ks = NULL; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c new file mode 100644 index 0000000000..9b50469ef1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.c @@ -0,0 +1,97 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "aes_ctr_prf.h" +#include "utilities.h" + +ret_t init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, + IN const uint32_t max_invokations, + IN const seed_t *seed) +{ + if(0 == max_invokations) { + BIKE_ERROR(E_AES_CTR_PRF_INIT_FAIL); + } + + // Set the key schedule (from seed). + // Make sure the size matches the AES256 key size. 
+ DEFER_CLEANUP(aes256_key_t key, aes256_key_cleanup); + + bike_static_assert(sizeof(*seed) == sizeof(key.raw), seed_size_equals_ky_size); + bike_memcpy(key.raw, seed->raw, sizeof(key.raw)); + + POSIX_GUARD(aes256_key_expansion(&s->ks, &key)); + + // Initialize buffer and counter + s->ctr.u.qw[0] = 0; + s->ctr.u.qw[1] = 0; + s->buffer.u.qw[0] = 0; + s->buffer.u.qw[1] = 0; + + s->pos = AES256_BLOCK_BYTES; + s->rem_invokations = max_invokations; + + DMSG(" Init aes_prf_ctr state:\n"); + DMSG(" s.pos = %d\n", s->pos); + DMSG(" s.rem_invokations = %u\n", s->rem_invokations); + + return SUCCESS; +} + +_INLINE_ ret_t perform_aes(OUT uint8_t *ct, IN OUT aes_ctr_prf_state_t *s) +{ + // Ensure that the CTR is large enough + bike_static_assert( + ((sizeof(s->ctr.u.qw[0]) == 8) && (BIT(33) >= MAX_AES_INVOKATION)), + ctr_size_is_too_small); + + if(0 == s->rem_invokations) { + BIKE_ERROR(E_AES_OVER_USED); + } + + POSIX_GUARD(aes256_enc(ct, s->ctr.u.bytes, &s->ks)); + + s->ctr.u.qw[0]++; + s->rem_invokations--; + + return SUCCESS; +} + +ret_t aes_ctr_prf(OUT uint8_t *a, + IN OUT aes_ctr_prf_state_t *s, + IN const uint32_t len) +{ + // When Len is smaller than use what's left in the buffer, + // there is no need for additional AES invocations. + if((len + s->pos) <= AES256_BLOCK_BYTES) { + bike_memcpy(a, &s->buffer.u.bytes[s->pos], len); + s->pos += len; + + return SUCCESS; + } + + // If s.pos != AES256_BLOCK_BYTES then copy what's left in the buffer. + // Else copy zero bytes + uint32_t idx = AES256_BLOCK_BYTES - s->pos; + bike_memcpy(a, &s->buffer.u.bytes[s->pos], idx); + + // Init s.pos + s->pos = 0; + + // Copy full AES blocks + while((len - idx) >= AES256_BLOCK_BYTES) { + POSIX_GUARD(perform_aes(&a[idx], s)); + idx += AES256_BLOCK_BYTES; + } + + POSIX_GUARD(perform_aes(s->buffer.u.bytes, s)); + + // Copy the tail + s->pos = len - idx; + bike_memcpy(&a[idx], s->buffer.u.bytes, s->pos); + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h new file mode 100644 index 0000000000..684a52a6fc --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/aes_ctr_prf.h @@ -0,0 +1,43 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "aes.h" + +////////////////////////////// +// Types +///////////////////////////// + +typedef struct aes_ctr_prf_state_s { + uint128_t ctr; + uint128_t buffer; + aes256_ks_t ks; + uint32_t rem_invokations; + uint8_t pos; +} aes_ctr_prf_state_t; + +////////////////////////////// +// Methods +///////////////////////////// + +ret_t init_aes_ctr_prf_state(OUT aes_ctr_prf_state_t *s, + IN uint32_t max_invokations, + IN const seed_t *seed); + +ret_t aes_ctr_prf(OUT uint8_t *a, IN OUT aes_ctr_prf_state_t *s, IN uint32_t len); + +_INLINE_ void finalize_aes_ctr_prf(IN OUT aes_ctr_prf_state_t *s) +{ + aes256_free_ks(&s->ks); + secure_clean((uint8_t *)s, sizeof(*s)); +} + +_INLINE_ void aes_ctr_prf_state_cleanup(IN OUT aes_ctr_prf_state_t *s) +{ + finalize_aes_ctr_prf(s); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h new file mode 100644 index 0000000000..697efd0627 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_defs.h @@ -0,0 +1,91 @@ +/* Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "defs.h" + +//////////////////////////////////////////// +// BIKE Parameters +/////////////////////////////////////////// +#define N0 2 + +#if !defined(LEVEL) +# define LEVEL 1 +#endif + +#if(LEVEL == 3) +# define R_BITS 24659 +# define DV 103 +# define T1 199 + +# define THRESHOLD_COEFF0 15.2588 +# define THRESHOLD_COEFF1 0.005265 +# define THRESHOLD_MIN 52 + +// The gf2m code is optimized to a block in this case: +# define BLOCK_BITS 32768 +#elif(LEVEL == 1) +// 64-bits of post-quantum security parameters (BIKE paper): +# define R_BITS 12323 +# define DV 71 +# define T1 134 + +# define THRESHOLD_COEFF0 13.530 +# define THRESHOLD_COEFF1 0.0069722 +# define THRESHOLD_MIN 36 + +// The gf2x code is optimized to a block in this case: +# define BLOCK_BITS (16384) +#else +# error "Bad level, choose one of 1/3/5" +#endif + +#define NUM_OF_SEEDS 2 + +// Round the size to the nearest byte. +// SIZE suffix, is the number of bytes (uint8_t). +#define N_BITS (R_BITS * N0) +#define R_BYTES DIVIDE_AND_CEIL(R_BITS, 8) +#define R_QWORDS DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_QWORD) +#define R_XMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_XMM) +#define R_YMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_YMM) +#define R_ZMM DIVIDE_AND_CEIL(R_BITS, 8 * BYTES_IN_ZMM) + +#define R_BLOCKS DIVIDE_AND_CEIL(R_BITS, BLOCK_BITS) +#define R_PADDED (R_BLOCKS * BLOCK_BITS) +#define R_PADDED_BYTES (R_PADDED / 8) +#define R_PADDED_QWORDS (R_PADDED / 64) + +#define LAST_R_QWORD_LEAD (R_BITS & MASK(6)) +#define LAST_R_QWORD_TRAIL (64 - LAST_R_QWORD_LEAD) +#define LAST_R_QWORD_MASK MASK(LAST_R_QWORD_LEAD) + +#define LAST_R_BYTE_LEAD (R_BITS & MASK(3)) +#define LAST_R_BYTE_TRAIL (8 - LAST_R_BYTE_LEAD) +#define LAST_R_BYTE_MASK MASK(LAST_R_BYTE_LEAD) + +// Data alignement +#define ALIGN_BYTES (BYTES_IN_ZMM) + +#define M_BITS 256 +#define M_BYTES (M_BITS / 8) + +#define SS_BITS 256 +#define SS_BYTES (SS_BITS / 8) + +#define SEED_BYTES (256 / 8) + +////////////////////////////////// +// Parameters for the BGF decoder. +////////////////////////////////// +#define BGF_DECODER +#define DELTA 3 +#define SLICES (LOG2_MSB(DV) + 1) + +// GF2X inversion can only handle R < 32768 +bike_static_assert((R_BITS < 32768), r_too_large_for_inversion); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c new file mode 100644 index 0000000000..328bb52db8 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/bike_r3_kem.c @@ -0,0 +1,288 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron, and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "decode.h" +#include "gf2x.h" +#include "sampling.h" +#include "sha.h" +#include "tls/s2n_kem.h" +#include "pq-crypto/s2n_pq.h" + +// m_t and seed_t have the same size and thus can be considered +// to be of the same type. However, for security reasons we distinguish +// these types, even on the costs of small extra complexity. 
+_INLINE_ void convert_seed_to_m_type(OUT m_t *m, IN const seed_t *seed) +{ + bike_static_assert(sizeof(*m) == sizeof(*seed), m_size_eq_seed_size); + bike_memcpy(m->raw, seed->raw, sizeof(*m)); +} + +_INLINE_ void convert_m_to_seed_type(OUT seed_t *seed, IN const m_t *m) +{ + bike_static_assert(sizeof(*m) == sizeof(*seed), m_size_eq_seed_size); + bike_memcpy(seed->raw, m->raw, sizeof(*seed)); +} + +// (e0, e1) = H(m) +_INLINE_ ret_t function_h(OUT pad_e_t *e, IN const m_t *m) +{ + DEFER_CLEANUP(seed_t seed = {0}, seed_cleanup); + + convert_m_to_seed_type(&seed, m); + return generate_error_vector(e, &seed); +} + +// out = L(e) +_INLINE_ ret_t function_l(OUT m_t *out, IN const pad_e_t *e) +{ + DEFER_CLEANUP(sha_dgst_t dgst = {0}, sha_dgst_cleanup); + DEFER_CLEANUP(e_t tmp, e_cleanup); + + // Take the padding away + tmp.val[0] = e->val[0].val; + tmp.val[1] = e->val[1].val; + + POSIX_GUARD(sha(&dgst, sizeof(tmp), (uint8_t *)&tmp)); + + // Truncate the SHA384 digest to a 256-bits m_t + bike_static_assert(sizeof(dgst) >= sizeof(*out), dgst_size_lt_m_size); + bike_memcpy(out->raw, dgst.u.raw, sizeof(*out)); + + return SUCCESS; +} + +// Generate the Shared Secret K(m, c0, c1) +_INLINE_ ret_t function_k(OUT ss_t *out, IN const m_t *m, IN const ct_t *ct) +{ + DEFER_CLEANUP(func_k_t tmp, func_k_cleanup); + DEFER_CLEANUP(sha_dgst_t dgst = {0}, sha_dgst_cleanup); + + // Copy every element, padded to the nearest byte + tmp.m = *m; + tmp.c0 = ct->c0; + tmp.c1 = ct->c1; + + POSIX_GUARD(sha(&dgst, sizeof(tmp), (uint8_t *)&tmp)); + + // Truncate the SHA384 digest to a 256-bits value + // to subsequently use it as a seed. + bike_static_assert(sizeof(dgst) >= sizeof(*out), dgst_size_lt_out_size); + bike_memcpy(out->raw, dgst.u.raw, sizeof(*out)); + + return SUCCESS; +} + +_INLINE_ ret_t encrypt(OUT ct_t *ct, + IN const pad_e_t *e, + IN const pk_t *pk, + IN const m_t *m) +{ + // Pad the public key and the ciphertext + pad_r_t p_ct = {0}; + pad_r_t p_pk = {0}; + p_pk.val = *pk; + + // Generate the ciphertext + // ct = pk * e1 + e0 + gf2x_mod_mul(&p_ct, &e->val[1], &p_pk); + gf2x_mod_add(&p_ct, &p_ct, &e->val[0]); + + ct->c0 = p_ct.val; + + // c1 = L(e0, e1) + POSIX_GUARD(function_l(&ct->c1, e)); + + // m xor L(e0, e1) + for(size_t i = 0; i < sizeof(*m); i++) { + ct->c1.raw[i] ^= m->raw[i]; + } + + return SUCCESS; +} + +_INLINE_ ret_t reencrypt(OUT m_t *m, IN const pad_e_t *e, IN const ct_t *l_ct) +{ + DEFER_CLEANUP(m_t tmp, m_cleanup); + + POSIX_GUARD(function_l(&tmp, e)); + + // m' = c1 ^ L(e') + for(size_t i = 0; i < sizeof(*m); i++) { + m->raw[i] = tmp.raw[i] ^ l_ct->c1.raw[i]; + } + + return SUCCESS; +} + +//////////////////////////////////////////////////////////////////////////////// +// The three APIs below (keypair, encapsulate, decapsulate) are defined by NIST: +//////////////////////////////////////////////////////////////////////////////// +int BIKE_L1_R3_crypto_kem_keypair(OUT unsigned char *pk, OUT unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(pk); + + DEFER_CLEANUP(aligned_sk_t l_sk = {0}, sk_cleanup); + + // The secret key is (h0, h1), + // and the public key h=(h0^-1 * h1). + // Padded structures are used internally, and are required by the + // decoder and the gf2x multiplication. 
+ DEFER_CLEANUP(pad_r_t h0 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h1 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h0inv = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h = {0}, pad_r_cleanup); + + // The randomness of the key generation + DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); + + // An AES_PRF state for the secret key + DEFER_CLEANUP(aes_ctr_prf_state_t h_prf_state = {0}, aes_ctr_prf_state_cleanup); + + POSIX_GUARD(get_seeds(&seeds)); + POSIX_GUARD(init_aes_ctr_prf_state(&h_prf_state, MAX_AES_INVOKATION, &seeds.seed[0])); + + // Generate the secret key (h0, h1) with weight w/2 + POSIX_GUARD(generate_sparse_rep(&h0, l_sk.wlist[0].val, &h_prf_state)); + POSIX_GUARD(generate_sparse_rep(&h1, l_sk.wlist[1].val, &h_prf_state)); + + // Generate sigma + convert_seed_to_m_type(&l_sk.sigma, &seeds.seed[1]); + + // Calculate the public key + gf2x_mod_inv(&h0inv, &h0); + gf2x_mod_mul(&h, &h1, &h0inv); + + // Fill the secret key data structure with contents - cancel the padding + l_sk.bin[0] = h0.val; + l_sk.bin[1] = h1.val; + l_sk.pk = h.val; + + // Copy the data to the output buffers + bike_memcpy(sk, &l_sk, sizeof(l_sk)); + bike_memcpy(pk, &l_sk.pk, sizeof(l_sk.pk)); + + return SUCCESS; +} + +// Encapsulate - pk is the public key, +// ct is a key encapsulation message (ciphertext), +// ss is the shared secret. +int BIKE_L1_R3_crypto_kem_enc(OUT unsigned char * ct, + OUT unsigned char * ss, + IN const unsigned char *pk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE_REF(pk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); + + // Public values (they do not require cleanup on exit). + pk_t l_pk; + ct_t l_ct; + + DEFER_CLEANUP(m_t m, m_cleanup); + DEFER_CLEANUP(ss_t l_ss, ss_cleanup); + DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); + DEFER_CLEANUP(pad_e_t e, pad_e_cleanup); + + // Copy the data from the input buffer. This is required in order to avoid + // alignment issues on non x86_64 processors. + bike_memcpy(&l_pk, pk, sizeof(l_pk)); + + POSIX_GUARD(get_seeds(&seeds)); + + // e = H(m) = H(seed[0]) + convert_seed_to_m_type(&m, &seeds.seed[0]); + POSIX_GUARD(function_h(&e, &m)); + + // Calculate the ciphertext + POSIX_GUARD(encrypt(&l_ct, &e, &l_pk, &m)); + + // Generate the shared secret + POSIX_GUARD(function_k(&l_ss, &m, &l_ct)); + + // Copy the data to the output buffers + bike_memcpy(ct, &l_ct, sizeof(l_ct)); + bike_memcpy(ss, &l_ss, sizeof(l_ss)); + + return SUCCESS; +} + +// Decapsulate - ct is a key encapsulation message (ciphertext), +// sk is the private key, +// ss is the shared secret +int BIKE_L1_R3_crypto_kem_dec(OUT unsigned char * ss, + IN const unsigned char *ct, + IN const unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE_REF(sk); + POSIX_ENSURE_REF(ct); + POSIX_ENSURE_REF(ss); + + // Public values, does not require a cleanup on exit + ct_t l_ct; + + DEFER_CLEANUP(seeds_t seeds = {0}, seeds_cleanup); + + DEFER_CLEANUP(ss_t l_ss, ss_cleanup); + DEFER_CLEANUP(aligned_sk_t l_sk, sk_cleanup); + DEFER_CLEANUP(e_t e, e_cleanup); + DEFER_CLEANUP(m_t m_prime, m_cleanup); + DEFER_CLEANUP(pad_e_t e_tmp, pad_e_cleanup); + DEFER_CLEANUP(pad_e_t e_prime, pad_e_cleanup); + + // Copy the data from the input buffers. This is required in order to avoid + // alignment issues on non x86_64 processors. 
+ bike_memcpy(&l_ct, ct, sizeof(l_ct)); + bike_memcpy(&l_sk, sk, sizeof(l_sk)); + + // Generate a random error vector to be used in case of decoding failure + // (Note: possibly, a "fixed" zeroed error vector could suffice too, + // and serve this generation) + POSIX_GUARD(get_seeds(&seeds)); + POSIX_GUARD(generate_error_vector(&e_prime, &seeds.seed[0])); + + // Decode and on success check if |e|=T (all in constant-time) + volatile uint32_t success_cond = (decode(&e, &l_ct, &l_sk) == SUCCESS); + success_cond &= secure_cmp32(T1, r_bits_vector_weight(&e.val[0]) + + r_bits_vector_weight(&e.val[1])); + + // Set appropriate error based on the success condition + uint8_t mask = ~secure_l32_mask(0, success_cond); + for(size_t i = 0; i < R_BYTES; i++) { + PE0_RAW(&e_prime)[i] &= u8_barrier(~mask); + PE0_RAW(&e_prime)[i] |= (u8_barrier(mask) & E0_RAW(&e)[i]); + PE1_RAW(&e_prime)[i] &= u8_barrier(~mask); + PE1_RAW(&e_prime)[i] |= (u8_barrier(mask) & E1_RAW(&e)[i]); + } + + POSIX_GUARD(reencrypt(&m_prime, &e_prime, &l_ct)); + + // Check if H(m') is equal to (e0', e1') + // (in constant-time) + POSIX_GUARD(function_h(&e_tmp, &m_prime)); + success_cond = secure_cmp(PE0_RAW(&e_prime), PE0_RAW(&e_tmp), R_BYTES); + success_cond &= secure_cmp(PE1_RAW(&e_prime), PE1_RAW(&e_tmp), R_BYTES); + + // Compute either K(m', C) or K(sigma, C) based on the success condition + mask = secure_l32_mask(0, success_cond); + for(size_t i = 0; i < M_BYTES; i++) { + m_prime.raw[i] &= u8_barrier(~mask); + m_prime.raw[i] |= (u8_barrier(mask) & l_sk.sigma.raw[i]); + } + + // Generate the shared secret + POSIX_GUARD(function_k(&l_ss, &m_prime, &l_ct)); + + // Copy the data into the output buffer + bike_memcpy(ss, &l_ss, sizeof(l_ss)); + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h new file mode 100644 index 0000000000..22e8c44250 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/cleanup.h @@ -0,0 +1,63 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "utilities.h" + +/* Runs _thecleanup function on _thealloc once _thealloc went out of scope */ +#define DEFER_CLEANUP(_thealloc, _thecleanup) \ + __attribute__((cleanup(_thecleanup))) _thealloc + +// len is bytes length of in +_INLINE_ void secure_clean(OUT uint8_t *p, IN const uint32_t len) +{ +#if defined(_WIN32) + SecureZeroMemory(p, len); +#else + typedef void *(*memset_t)(void *, int, size_t); + static volatile memset_t memset_func = bike_memset; + memset_func(p, 0, len); +#endif +} + +#define CLEANUP_FUNC(name, type) \ + _INLINE_ void name##_cleanup(IN OUT type *o) \ + { \ + secure_clean((uint8_t *)o, sizeof(*o)); \ + } + +CLEANUP_FUNC(r, r_t) +CLEANUP_FUNC(m, m_t) +CLEANUP_FUNC(e, e_t) +CLEANUP_FUNC(sk, sk_t) +CLEANUP_FUNC(ss, ss_t) +CLEANUP_FUNC(ct, ct_t) +CLEANUP_FUNC(pad_r, pad_r_t) +CLEANUP_FUNC(pad_e, pad_e_t) +CLEANUP_FUNC(seed, seed_t) +CLEANUP_FUNC(syndrome, syndrome_t) +CLEANUP_FUNC(upc, upc_t) +CLEANUP_FUNC(func_k, func_k_t) +CLEANUP_FUNC(dbl_pad_r, dbl_pad_r_t) + +// The functions below require special handling because we deal +// with arrays and not structures. 
+ +_INLINE_ void compressed_idx_d_ar_cleanup(IN OUT compressed_idx_d_ar_t *o) +{ + for(int i = 0; i < N0; i++) { + secure_clean((uint8_t *)&(*o)[i], sizeof((*o)[0])); + } +} + +_INLINE_ void seeds_cleanup(IN OUT seeds_t *o) +{ + for(int i = 0; i < NUM_OF_SEEDS; i++) { + seed_cleanup(&(o->seed[i])); + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c new file mode 100644 index 0000000000..c280b95f03 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.c @@ -0,0 +1,280 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * [1] The optimizations are based on the description developed in the paper: + * Drucker, Nir, and Shay Gueron. 2019. “A Toolbox for Software Optimization + * of QC-MDPC Code-Based Cryptosystems.” Journal of Cryptographic Engineering, + * January, 1–17. https://doi.org/10.1007/s13389-018-00200-4. + * + * [2] The decoder algorithm is the Black-Gray decoder in + * the early submission of CAKE (due to N. Sandrier and R Misoczki). + * + * [3] The analysis for the constant time implementation is given in + * Drucker, Nir, Shay Gueron, and Dusan Kostic. 2019. + * “On Constant-Time QC-MDPC Decoding with Negligible Failure Rate.” + * Cryptology EPrint Archive, 2019. https://eprint.iacr.org/2019/1289. + * + * [4] it was adapted to BGF in: + * Drucker, Nir, Shay Gueron, and Dusan Kostic. 2019. + * “QC-MDPC decoders with several shades of gray.” + * Cryptology EPrint Archive, 2019. To be published. + * + * [5] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography. + * In: Gier-lichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware + * and Embedded Systems– CHES 2016. pp. 280–300. Springer Berlin Heidelberg, + * Berlin, Heidelberg (2016) + * + * [6] The rotate512_small funciton is a derivative of the code described in: + * Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019. + * “Optimized Implementation of QC-MDPC Code-Based Cryptography.” + * Concurrency and Computation: Practice and Experience 31 (18): + * e5089. https://doi.org/10.1002/cpe.5089. 
+ */ + +#include "decode.h" +#include "cleanup.h" +#include "decode_internal.h" +#include "gf2x.h" +#include "utilities.h" + +// Decoding (bit-flipping) parameter +#if defined(BG_DECODER) +# if(LEVEL == 1) +# define MAX_IT 3 +# elif(LEVEL == 3) +# define MAX_IT 4 +# else +# error "Level can only be 1/3" +# endif +#elif defined(BGF_DECODER) +# if(LEVEL == 1) +# define MAX_IT 5 +# elif(LEVEL == 3) +# define MAX_IT 5 +# else +# error "Level can only be 1/3" +# endif +#endif + +ret_t compute_syndrome(OUT syndrome_t *syndrome, + IN const pad_r_t *c0, + IN const pad_r_t *h0, + IN const decode_ctx *ctx) +{ + DEFER_CLEANUP(pad_r_t pad_s, pad_r_cleanup); + + gf2x_mod_mul(&pad_s, c0, h0); + + bike_memcpy((uint8_t *)syndrome->qw, pad_s.val.raw, R_BYTES); + ctx->dup(syndrome); + + return SUCCESS; +} + +_INLINE_ ret_t recompute_syndrome(OUT syndrome_t *syndrome, + IN const pad_r_t *c0, + IN const pad_r_t *h0, + IN const pad_r_t *pk, + IN const e_t *e, + IN const decode_ctx *ctx) +{ + DEFER_CLEANUP(pad_r_t tmp_c0, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t e0 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t e1 = {0}, pad_r_cleanup); + + e0.val = e->val[0]; + e1.val = e->val[1]; + + // tmp_c0 = pk * e1 + c0 + e0 + gf2x_mod_mul(&tmp_c0, &e1, pk); + gf2x_mod_add(&tmp_c0, &tmp_c0, c0); + gf2x_mod_add(&tmp_c0, &tmp_c0, &e0); + + // Recompute the syndrome using the updated ciphertext + POSIX_GUARD(compute_syndrome(syndrome, &tmp_c0, h0, ctx)); + + return SUCCESS; +} + +_INLINE_ uint8_t get_threshold(IN const syndrome_t *s) +{ + bike_static_assert(sizeof(*s) >= sizeof(r_t), syndrome_is_large_enough); + + const uint32_t syndrome_weight = r_bits_vector_weight((const r_t *)s->qw); + + // The equations below are defined in BIKE's specification p. 16, Section 5.2 + uint32_t thr = THRESHOLD_COEFF0 + (THRESHOLD_COEFF1 * syndrome_weight); + const uint32_t mask = secure_l32_mask(thr, THRESHOLD_MIN); + thr = (u32_barrier(mask) & thr) | (u32_barrier(~mask) & THRESHOLD_MIN); + + DMSG(" Threshold: %d\n", thr); + return thr; +} + +// Calculate the Unsatisfied Parity Checks (UPCs) and update the errors +// vector (e) accordingly. In addition, update the black and gray errors vector +// with the relevant values. +_INLINE_ void find_err1(OUT e_t *e, + OUT e_t *black_e, + OUT e_t *gray_e, + IN const syndrome_t * syndrome, + IN const compressed_idx_d_ar_t wlist, + IN const uint8_t threshold, + IN const decode_ctx *ctx) +{ + // This function uses the bit-slice-adder methodology of [5]: + DEFER_CLEANUP(syndrome_t rotated_syndrome = {0}, syndrome_cleanup); + DEFER_CLEANUP(upc_t upc, upc_cleanup); + + for(uint32_t i = 0; i < N0; i++) { + // UPC must start from zero at every iteration + bike_memset(&upc, 0, sizeof(upc)); + + // 1) Right-rotate the syndrome for every secret key set bit index + // Then slice-add it to the UPC array. + for(size_t j = 0; j < DV; j++) { + ctx->rotate_right(&rotated_syndrome, syndrome, wlist[i].val[j]); + ctx->bit_sliced_adder(&upc, &rotated_syndrome, LOG2_MSB(j + 1)); + } + + // 2) Subtract the threshold from the UPC counters + ctx->bit_slice_full_subtract(&upc, threshold); + + // 3) Update the errors and the black errors vectors. + // The last slice of the UPC array holds the MSB of the accumulated values + // minus the threshold. Every zero bit indicates a potential error bit. + // The errors values are stored in the black array and xored with the + // errors Of the previous iteration. 
+ const r_t *last_slice = &(upc.slice[SLICES - 1].u.r.val); + for(size_t j = 0; j < R_BYTES; j++) { + const uint8_t sum_msb = (~last_slice->raw[j]); + black_e->val[i].raw[j] = sum_msb; + e->val[i].raw[j] ^= sum_msb; + } + + // Ensure that the padding bits (upper bits of the last byte) are zero so + // they will not be included in the multiplication and in the hash function. + e->val[i].raw[R_BYTES - 1] &= LAST_R_BYTE_MASK; + + // 4) Calculate the gray error array by adding "DELTA" to the UPC array. + // For that we reuse the rotated_syndrome variable setting it to all "1". + for(size_t l = 0; l < DELTA; l++) { + bike_memset((uint8_t *)rotated_syndrome.qw, 0xff, R_BYTES); + ctx->bit_sliced_adder(&upc, &rotated_syndrome, SLICES); + } + + // 5) Update the gray list with the relevant bits that are not + // set in the black list. + for(size_t j = 0; j < R_BYTES; j++) { + const uint8_t sum_msb = (~last_slice->raw[j]); + gray_e->val[i].raw[j] = (~(black_e->val[i].raw[j])) & sum_msb; + } + } +} + +// Recalculate the UPCs and update the errors vector (e) according to it +// and to the black/gray vectors. +_INLINE_ void find_err2(OUT e_t *e, + IN e_t * pos_e, + IN const syndrome_t * syndrome, + IN const compressed_idx_d_ar_t wlist, + IN const uint8_t threshold, + IN const decode_ctx *ctx) +{ + DEFER_CLEANUP(syndrome_t rotated_syndrome = {0}, syndrome_cleanup); + DEFER_CLEANUP(upc_t upc, upc_cleanup); + + for(uint32_t i = 0; i < N0; i++) { + // UPC must start from zero at every iteration + bike_memset(&upc, 0, sizeof(upc)); + + // 1) Right-rotate the syndrome, for every index of a set bit in the secret + // key. Then slice-add it to the UPC array. + for(size_t j = 0; j < DV; j++) { + ctx->rotate_right(&rotated_syndrome, syndrome, wlist[i].val[j]); + ctx->bit_sliced_adder(&upc, &rotated_syndrome, LOG2_MSB(j + 1)); + } + + // 2) Subtract the threshold from the UPC counters + ctx->bit_slice_full_subtract(&upc, threshold); + + // 3) Update the errors vector. + // The last slice of the UPC array holds the MSB of the accumulated values + // minus the threshold. Every zero bit indicates a potential error bit. + const r_t *last_slice = &(upc.slice[SLICES - 1].u.r.val); + for(size_t j = 0; j < R_BYTES; j++) { + const uint8_t sum_msb = (~last_slice->raw[j]); + e->val[i].raw[j] ^= (pos_e->val[i].raw[j] & sum_msb); + } + + // Ensure that the padding bits (upper bits of the last byte) are zero, so + // they are not included in the multiplication, and in the hash function. + e->val[i].raw[R_BYTES - 1] &= LAST_R_BYTE_MASK; + } +} + +ret_t decode(OUT e_t *e, IN const ct_t *ct, IN const sk_t *sk) +{ + // Initialize the decode methods struct + decode_ctx ctx; + decode_ctx_init(&ctx); + + DEFER_CLEANUP(e_t black_e = {0}, e_cleanup); + DEFER_CLEANUP(e_t gray_e = {0}, e_cleanup); + + DEFER_CLEANUP(pad_r_t c0 = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t h0 = {0}, pad_r_cleanup); + pad_r_t pk = {0}; + + // Pad ciphertext (c0), secret key (h0), and public key (h) + c0.val = ct->c0; + h0.val = sk->bin[0]; + pk.val = sk->pk; + + DEFER_CLEANUP(syndrome_t s = {0}, syndrome_cleanup); + DMSG(" Computing s.\n"); + POSIX_GUARD(compute_syndrome(&s, &c0, &h0, &ctx)); + ctx.dup(&s); + + // Reset (init) the error because it is xored in the find_err functions. 
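+  // Note (reading aid, not part of the original comment): with BGF_DECODER,
+  // only the first iteration performs the two black/gray find_err2 passes;
+  // later iterations run find_err1 alone (see the continue statement below).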
+ bike_memset(e, 0, sizeof(*e)); + + for(uint32_t iter = 0; iter < MAX_IT; iter++) { + const uint8_t threshold = get_threshold(&s); + + DMSG(" Iteration: %d\n", iter); + DMSG(" Weight of e: %lu\n", + r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); + DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); + + find_err1(e, &black_e, &gray_e, &s, sk->wlist, threshold, &ctx); + POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx)); +#if defined(BGF_DECODER) + if(iter >= 1) { + continue; + } +#endif + DMSG(" Weight of e: %lu\n", + r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); + DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); + + find_err2(e, &black_e, &s, sk->wlist, ((DV + 1) / 2) + 1, &ctx); + POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx)); + + DMSG(" Weight of e: %lu\n", + r_bits_vector_weight(&e->val[0]) + r_bits_vector_weight(&e->val[1])); + DMSG(" Weight of syndrome: %lu\n", r_bits_vector_weight((r_t *)s.qw)); + + find_err2(e, &gray_e, &s, sk->wlist, ((DV + 1) / 2) + 1, &ctx); + POSIX_GUARD(recompute_syndrome(&s, &c0, &h0, &pk, e, &ctx)); + } + + if(r_bits_vector_weight((r_t *)s.qw) > 0) { + BIKE_ERROR(E_DECODING_FAILURE); + } + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h new file mode 100644 index 0000000000..8e405ea12e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode.h @@ -0,0 +1,12 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "types.h" + +ret_t decode(OUT e_t *e, IN const ct_t *ct, IN const sk_t *sk); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c new file mode 100644 index 0000000000..ea8b91a499 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx2.c @@ -0,0 +1,173 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The rotate functions are based on the Barrel shifter described in [1] and + * some code snippets from [2]: + * + * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography. + * In: Gier-lichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware + * and Embedded Systems– CHES 2016. pp. 280–300. Springer Berlin Heidelberg, + * Berlin, Heidelberg (2016) + * + * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019. + * “Optimized Implementation of QC-MDPC Code-Based Cryptography.” + * Concurrency and Computation: Practice and Experience 31 (18): + * e5089. https://doi.org/10.1002/cpe.5089. 
+ */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include "decode.h" +#include "decode_internal.h" +#include "utilities.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +#define R_YMM_HALF_LOG2 UPTOPOW2(R_YMM / 2) + +_INLINE_ void +rotate256_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t ymm_num) +{ + // For preventing overflows (comparison in bytes) + bike_static_assert(sizeof(*out) > + (BYTES_IN_YMM * (R_YMM + (2 * R_YMM_HALF_LOG2))), + rotr_big_err); + + *out = *in; + + for(uint32_t idx = R_YMM_HALF_LOG2; idx >= 1; idx >>= 1) { + const uint8_t mask = secure_l32_mask(ymm_num, idx); + const __m256i blend_mask = SET1_I8(mask); + ymm_num = ymm_num - (idx & mask); + + for(size_t i = 0; i < (R_YMM + idx); i++) { + __m256i a = LOAD(&out->qw[4 * (i + idx)]); + __m256i b = LOAD(&out->qw[4 * i]); + b = BLENDV_I8(b, a, blend_mask); + STORE(&out->qw[4 * i], b); + } + } +} + +_INLINE_ void +rotate256_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t count) +{ + __m256i carry_in = SET_ZERO; + const int count64 = (int)count & 0x3f; + const uint64_t count_mask = (count >> 5) & 0xe; + + __m256i idx = SET_I32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i zero_mask = SET_I64(-1, -1, -1, 0); + const __m256i count_vet = SET1_I8(count_mask); + + ALIGN(ALIGN_BYTES) + const uint8_t zero_mask2_buf[] = { + 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x86, 0x84, 0x84, 0x84, + 0x84, 0x84, 0x84, 0x84, 0x84, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82, + 0x82, 0x82, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + __m256i zero_mask2 = LOAD(zero_mask2_buf); + + zero_mask2 = SUB_I8(zero_mask2, count_vet); + idx = ADD_I8(idx, count_vet); + + for(int i = R_YMM; i >= 0; i--) { + // Load the next 256 bits + __m256i in256 = LOAD(&in->qw[4 * i]); + + // Rotate the current and previous 256 registers so that their quadwords + // would be in the right positions. + __m256i carry_out = PERMVAR_I32(in256, idx); + in256 = BLENDV_I8(carry_in, carry_out, zero_mask2); + + // Shift less than 64 (quadwords internal) + __m256i inner_carry = BLENDV_I8(carry_in, in256, zero_mask); + inner_carry = PERM_I64(inner_carry, 0x39); + const __m256i out256 = + SRLI_I64(in256, count64) | SLLI_I64(inner_carry, (int)64 - count64); + + // Store the rotated value + STORE(&out->qw[4 * i], out256); + carry_in = carry_out; + } +} + +void rotate_right_avx2(OUT syndrome_t *out, + IN const syndrome_t *in, + IN const uint32_t bitscount) +{ + // 1) Rotate in granularity of 256 bits blocks, using YMMs + rotate256_big(out, in, (bitscount / BITS_IN_YMM)); + // 2) Rotate in smaller granularity (less than 256 bits), using YMMs + rotate256_small(out, out, (bitscount % BITS_IN_YMM)); +} + +// Duplicates the first R_BITS of the syndrome three times +// |------------------------------------------| +// | Third copy | Second copy | first R_BITS | +// |------------------------------------------| +// This is required by the rotate functions. +void dup_avx2(IN OUT syndrome_t *s) +{ + s->qw[R_QWORDS - 1] = + (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK); + + for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) { + s->qw[R_QWORDS + i] = + (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD); + } +} + +// Use half-adder as described in [1]. 
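+// (Bit-sliced form: for every quadword, the new slice value is upc ^ syndrome
+// and the carry propagated to the next slice is upc & syndrome, i.e. one XOR
+// and one AND update 64 counters at a time.)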
+void bit_sliced_adder_avx2(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices) +{ + // From cache-memory perspective this loop should be the outside loop + for(size_t j = 0; j < num_of_slices; j++) { + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]); + upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i]; + rotated_syndrome->qw[i] = carry; + } + } +} + +void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val) +{ + // Borrow + uint64_t br[R_QWORDS] = {0}; + + for(size_t j = 0; j < SLICES; j++) { + + const uint64_t lsb_mask = 0 - (val & 0x1); + val >>= 1; + + // Perform a - b with c as the input/output carry + // br = 0 0 0 0 1 1 1 1 + // a = 0 0 1 1 0 0 1 1 + // b = 0 1 0 1 0 1 0 1 + // ------------------- + // o = 0 1 1 0 0 1 1 1 + // c = 0 1 0 0 1 1 0 1 + // + // o = a^b^c + // _ __ _ _ _ _ _ + // br = abc + abc + abc + abc = abc + ((a+b))c + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t a = upc->slice[j].u.qw[i]; + const uint64_t b = lsb_mask; + const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i])); + upc->slice[j].u.qw[i] = a ^ b ^ br[i]; + br[i] = tmp; + } + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c new file mode 100644 index 0000000000..ef7f6d29d5 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_avx512.c @@ -0,0 +1,167 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The rotation functions are based on the Barrel shifter described in [1] + * and some modifed snippet from [2] + * [1] Chou, T.: QcBits: Constant-Time Small-Key Code-Based Cryptography. + * In: Gier-lichs, B., Poschmann, A.Y. (eds.) Cryptographic Hardware + * and Embedded Systems– CHES 2016. pp. 280–300. Springer Berlin Heidelberg, + * Berlin, Heidelberg (2016) + * + * [2] Guimarães, Antonio, Diego F Aranha, and Edson Borin. 2019. + * “Optimized Implementation of QC-MDPC Code-Based Cryptography.” + * Concurrency and Computation: Practice and Experience 31 (18): + * e5089. https://doi.org/10.1002/cpe.5089. 
+ */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include "decode.h" +#include "decode_internal.h" +#include "utilities.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +#define R_ZMM_HALF_LOG2 UPTOPOW2(R_ZMM / 2) + +_INLINE_ void +rotate512_big(OUT syndrome_t *out, IN const syndrome_t *in, size_t zmm_num) +{ + // For preventing overflows (comparison in bytes) + bike_static_assert(sizeof(*out) > + (BYTES_IN_ZMM * (R_ZMM + (2 * R_ZMM_HALF_LOG2))), + rotr_big_err); + *out = *in; + + for(uint32_t idx = R_ZMM_HALF_LOG2; idx >= 1; idx >>= 1) { + const uint8_t mask = secure_l32_mask(zmm_num, idx); + zmm_num = zmm_num - (idx & mask); + + for(size_t i = 0; i < (R_ZMM + idx); i++) { + const __m512i a = LOAD(&out->qw[8 * (i + idx)]); + MSTORE(&out->qw[8 * i], mask, a); + } + } +} + +// The rotate512_small function is a derivative of the code described in [1] +_INLINE_ void +rotate512_small(OUT syndrome_t *out, IN const syndrome_t *in, size_t bitscount) +{ + __m512i previous = SET_ZERO; + const int count64 = (int)bitscount & 0x3f; + const __m512i count64_512 = SET1_I64(count64); + const __m512i count64_512r = SET1_I64((int)64 - count64); + + const __m512i num_full_qw = SET1_I64(bitscount >> 6); + const __m512i one = SET1_I64(1); + __m512i a0, a1; + + __m512i idx = SET_I64(7, 6, 5, 4, 3, 2, 1, 0); + + // Positions above 7 are taken from the second register in + // _mm512_permutex2var_epi64 + idx = ADD_I64(idx, num_full_qw); + __m512i idx1 = ADD_I64(idx, one); + + for(int i = R_ZMM; i >= 0; i--) { + // Load the next 512 bits + const __m512i in512 = LOAD(&in->qw[8 * i]); + + // Rotate the current and previous 512 registers so that their quadwords + // would be in the right positions. + a0 = PERMX2VAR_I64(in512, idx, previous); + a1 = PERMX2VAR_I64(in512, idx1, previous); + + a0 = SRLV_I64(a0, count64_512); + a1 = SLLV_I64(a1, count64_512r); + + // Shift less than 64 (quadwords internal) + const __m512i out512 = a0 | a1; + + // Store the rotated value + STORE(&out->qw[8 * i], out512); + previous = in512; + } +} + +void rotate_right_avx512(OUT syndrome_t *out, + IN const syndrome_t *in, + IN const uint32_t bitscount) +{ + // 1) Rotate in granularity of 512 bits blocks, using ZMMs + rotate512_big(out, in, (bitscount / BITS_IN_ZMM)); + // 2) Rotate in smaller granularity (less than 512 bits), using ZMMs + rotate512_small(out, out, (bitscount % BITS_IN_ZMM)); +} + +// Duplicates the first R_BITS of the syndrome three times +// |------------------------------------------| +// | Third copy | Second copy | first R_BITS | +// |------------------------------------------| +// This is required by the rotate functions. +void dup_avx512(IN OUT syndrome_t *s) +{ + s->qw[R_QWORDS - 1] = + (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK); + + for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) { + s->qw[R_QWORDS + i] = + (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD); + } +} + +// Use half-adder as described in [1]. 
+void bit_sliced_adder_avx512(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices) +{ + // From cache-memory perspective this loop should be the outside loop + for(size_t j = 0; j < num_of_slices; j++) { + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]); + upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i]; + rotated_syndrome->qw[i] = carry; + } + } +} + +void bit_slice_full_subtract_avx512(OUT upc_t *upc, IN uint8_t val) +{ + // Borrow + uint64_t br[R_QWORDS] = {0}; + + for(size_t j = 0; j < SLICES; j++) { + + const uint64_t lsb_mask = 0 - (val & 0x1); + val >>= 1; + + // Perform a - b with c as the input/output carry + // br = 0 0 0 0 1 1 1 1 + // a = 0 0 1 1 0 0 1 1 + // b = 0 1 0 1 0 1 0 1 + // ------------------- + // o = 0 1 1 0 0 1 1 1 + // c = 0 1 0 0 1 1 0 1 + // + // o = a^b^c + // _ __ _ _ _ _ _ + // br = abc + abc + abc + abc = abc + ((a+b))c + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t a = upc->slice[j].u.qw[i]; + const uint64_t b = lsb_mask; + const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i])); + upc->slice[j].u.qw[i] = a ^ b ^ br[i]; + br[i] = tmp; + } + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h new file mode 100644 index 0000000000..817cc4603a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_internal.h @@ -0,0 +1,86 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "pq-crypto/s2n_pq.h" +#include "defs.h" +#include "types.h" + +// Rotate right the first R_BITS of a syndrome. +// At input, the syndrome is stored as three R_BITS triplicate. +// (this makes rotation easier to implement) +// For the output: the output syndrome has only one R_BITS rotation, the remaining +// (2 * R_BITS) bits are undefined. 
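+// The triplicated input layout is produced by the dup_* functions declared below.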
+void rotate_right_port(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); +void dup_port(IN OUT syndrome_t *s); +void bit_sliced_adder_port(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices); +void bit_slice_full_subtract_port(OUT upc_t *upc, IN uint8_t val); + +#if defined(S2N_BIKE_R3_AVX2) +void rotate_right_avx2(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); +void dup_avx2(IN OUT syndrome_t *s); +void bit_sliced_adder_avx2(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices); +void bit_slice_full_subtract_avx2(OUT upc_t *upc, IN uint8_t val); +#endif + +#if defined(S2N_BIKE_R3_AVX512) +void rotate_right_avx512(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); +void dup_avx512(IN OUT syndrome_t *s); +void bit_sliced_adder_avx512(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices); +void bit_slice_full_subtract_avx512(OUT upc_t *upc, IN uint8_t val); +#endif + +// Decode methods struct +typedef struct decode_ctx_st { + void (*rotate_right)(OUT syndrome_t *out, + IN const syndrome_t *in, + IN uint32_t bitscount); + void (*dup)(IN OUT syndrome_t *s); + void (*bit_sliced_adder)(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrom, + IN const size_t num_of_slices); + void (*bit_slice_full_subtract)(OUT upc_t *upc, IN uint8_t val); +} decode_ctx; + +_INLINE_ void decode_ctx_init(decode_ctx *ctx) +{ +#if defined(S2N_BIKE_R3_AVX512) + if(s2n_bike_r3_is_avx512_enabled()) { + ctx->rotate_right = rotate_right_avx512; + ctx->dup = dup_avx512; + ctx->bit_sliced_adder = bit_sliced_adder_avx512; + ctx->bit_slice_full_subtract = bit_slice_full_subtract_avx512; + } else +#endif +#if defined(S2N_BIKE_R3_AVX2) + if(s2n_bike_r3_is_avx2_enabled()) { + ctx->rotate_right = rotate_right_avx2; + ctx->dup = dup_avx2; + ctx->bit_sliced_adder = bit_sliced_adder_avx2; + ctx->bit_slice_full_subtract = bit_slice_full_subtract_avx2; + } else +#endif + { + ctx->rotate_right = rotate_right_port; + ctx->dup = dup_port; + ctx->bit_sliced_adder = bit_sliced_adder_port; + ctx->bit_slice_full_subtract = bit_slice_full_subtract_port; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c new file mode 100644 index 0000000000..846818386d --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/decode_portable.c @@ -0,0 +1,126 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "decode.h" +#include "decode_internal.h" +#include "utilities.h" + +#define R_QWORDS_HALF_LOG2 UPTOPOW2(R_QWORDS / 2) + +_INLINE_ void +rotr_big(OUT syndrome_t *out, IN const syndrome_t *in, IN size_t qw_num) +{ + // For preventing overflows (comparison in bytes) + bike_static_assert(sizeof(*out) > 8 * (R_QWORDS + (2 * R_QWORDS_HALF_LOG2)), + rotr_big_err); + + *out = *in; + + for(uint32_t idx = R_QWORDS_HALF_LOG2; idx >= 1; idx >>= 1) { + // Convert 32 bit mask to 64 bit mask + const uint64_t mask = ((uint32_t)secure_l32_mask(qw_num, idx) + 1U) - 1ULL; + qw_num = qw_num - (idx & u64_barrier(mask)); + + // Rotate R_QWORDS quadwords and another idx quadwords, + // as needed by the next iteration. 
+ for(size_t i = 0; i < (R_QWORDS + idx); i++) { + out->qw[i] = (out->qw[i] & u64_barrier(~mask)) | + (out->qw[i + idx] & u64_barrier(mask)); + } + } +} + +_INLINE_ void +rotr_small(OUT syndrome_t *out, IN const syndrome_t *in, IN const size_t bits) +{ + bike_static_assert(bits < 64, rotr_small_err); + bike_static_assert(sizeof(*out) > (8 * R_QWORDS), rotr_small_qw_err); + + // Convert |bits| to 0/1 by using !!bits; then create a mask of 0 or + // 0xffffffffff Use high_shift to avoid undefined behaviour when doing x << 64; + const uint64_t mask = (0 - (!!bits)); + const uint64_t high_shift = (64 - bits) & u64_barrier(mask); + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t low_part = in->qw[i] >> bits; + const uint64_t high_part = (in->qw[i + 1] << high_shift) & u64_barrier(mask); + out->qw[i] = low_part | high_part; + } +} + +void rotate_right_port(OUT syndrome_t *out, + IN const syndrome_t *in, + IN const uint32_t bitscount) +{ + // Rotate (64-bit) quad-words + rotr_big(out, in, (bitscount / 64)); + // Rotate bits (less than 64) + rotr_small(out, out, (bitscount % 64)); +} + +// Duplicates the first R_BITS of the syndrome three times +// |------------------------------------------| +// | Third copy | Second copy | first R_BITS | +// |------------------------------------------| +// This is required by the rotate functions. +void dup_port(IN OUT syndrome_t *s) +{ + s->qw[R_QWORDS - 1] = + (s->qw[0] << LAST_R_QWORD_LEAD) | (s->qw[R_QWORDS - 1] & LAST_R_QWORD_MASK); + + for(size_t i = 0; i < (2 * R_QWORDS) - 1; i++) { + s->qw[R_QWORDS + i] = + (s->qw[i] >> LAST_R_QWORD_TRAIL) | (s->qw[i + 1] << LAST_R_QWORD_LEAD); + } +} + +// Use half-adder as described in [1]. +void bit_sliced_adder_port(OUT upc_t *upc, + IN OUT syndrome_t *rotated_syndrome, + IN const size_t num_of_slices) +{ + // From cache-memory perspective this loop should be the outside loop + for(size_t j = 0; j < num_of_slices; j++) { + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t carry = (upc->slice[j].u.qw[i] & rotated_syndrome->qw[i]); + upc->slice[j].u.qw[i] ^= rotated_syndrome->qw[i]; + rotated_syndrome->qw[i] = carry; + } + } +} + +void bit_slice_full_subtract_port(OUT upc_t *upc, IN uint8_t val) +{ + // Borrow + uint64_t br[R_QWORDS] = {0}; + + for(size_t j = 0; j < SLICES; j++) { + + const uint64_t lsb_mask = 0 - (val & 0x1); + val >>= 1; + + // Perform a - b with c as the input/output carry + // br = 0 0 0 0 1 1 1 1 + // a = 0 0 1 1 0 0 1 1 + // b = 0 1 0 1 0 1 0 1 + // ------------------- + // o = 0 1 1 0 0 1 1 1 + // c = 0 1 0 0 1 1 0 1 + // + // o = a^b^c + // _ __ _ _ _ _ _ + // br = abc + abc + abc + abc = abc + ((a+b))c + + for(size_t i = 0; i < R_QWORDS; i++) { + const uint64_t a = upc->slice[j].u.qw[i]; + const uint64_t b = lsb_mask; + const uint64_t tmp = ((~a) & b & (~br[i])) | ((((~a) | b) & br[i])); + upc->slice[j].u.qw[i] = a ^ b ^ br[i]; + br[i] = tmp; + } + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h new file mode 100644 index 0000000000..ab3f5c7a32 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/defs.h @@ -0,0 +1,107 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +//////////////////////////////////////////// +// Basic defs +/////////////////////////////////////////// + +// For code clarity. 
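+// IN and OUT expand to nothing; they only document parameter direction.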
+#define IN +#define OUT + +#define ALIGN(n) __attribute__((aligned(n))) +#define BIKE_UNUSED_ATT __attribute__((unused)) + +#define _INLINE_ static inline + +// In asm the symbols '==' and '?' are not allowed. Therefore, if using +// divide_and_ceil in asm files, we must ensure with static_assert its validity. +#if(__cplusplus >= 201103L) || defined(static_assert) +# define bike_static_assert(COND, MSG) static_assert(COND, "MSG") +#else +# define bike_static_assert(COND, MSG) \ + typedef char static_assertion_##MSG[(COND) ? 1 : -1] BIKE_UNUSED_ATT +#endif + +// Divide by the divider and round up to next integer +#define DIVIDE_AND_CEIL(x, divider) (((x) + (divider) - 1) / (divider)) + +// Bit manipulations +// Linux Assemblies, except for Ubuntu, cannot understand what ULL mean. +// Therefore, in that case len must be smaller than 31. +#define BIT(len) (1ULL << (len)) +#define MASK(len) (BIT(len) - 1) +#define SIZEOF_BITS(b) (sizeof(b) * 8) + +#define BYTES_IN_QWORD 0x8 +#define BYTES_IN_XMM 0x10 +#define BYTES_IN_YMM 0x20 +#define BYTES_IN_ZMM 0x40 + +#define BITS_IN_YMM (BYTES_IN_YMM * 8) +#define BITS_IN_ZMM (BYTES_IN_ZMM * 8) + +#define WORDS_IN_YMM (BYTES_IN_YMM / sizeof(uint16_t)) +#define WORDS_IN_ZMM (BYTES_IN_ZMM / sizeof(uint16_t)) + +#define QWORDS_IN_XMM (BYTES_IN_XMM / sizeof(uint64_t)) +#define QWORDS_IN_YMM (BYTES_IN_YMM / sizeof(uint64_t)) +#define QWORDS_IN_ZMM (BYTES_IN_ZMM / sizeof(uint64_t)) + +// Copied from (Kaz answer) +// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2 +#define UPTOPOW2_0(v) ((v)-1) +#define UPTOPOW2_1(v) (UPTOPOW2_0(v) | (UPTOPOW2_0(v) >> 1)) +#define UPTOPOW2_2(v) (UPTOPOW2_1(v) | (UPTOPOW2_1(v) >> 2)) +#define UPTOPOW2_3(v) (UPTOPOW2_2(v) | (UPTOPOW2_2(v) >> 4)) +#define UPTOPOW2_4(v) (UPTOPOW2_3(v) | (UPTOPOW2_3(v) >> 8)) +#define UPTOPOW2_5(v) (UPTOPOW2_4(v) | (UPTOPOW2_4(v) >> 16)) + +#define UPTOPOW2(v) (UPTOPOW2_5(v) + 1) + +// Works only for 0 < v < 512 +#define LOG2_MSB(v) \ + ((v) == 0 \ + ? 0 \ + : ((v) < 2 \ + ? 1 \ + : ((v) < 4 \ + ? 2 \ + : ((v) < 8 \ + ? 3 \ + : ((v) < 16 \ + ? 4 \ + : ((v) < 32 \ + ? 5 \ + : ((v) < 64 \ + ? 6 \ + : ((v) < 128 ? 7 \ + : ((v) < 256 ? 8 : 9))))))))) + +//////////////////////////////////////////// +// Debug +/////////////////////////////////////////// + +#if defined(VERBOSE) +# include <stdio.h> + +# define DMSG(...) \ + { \ + printf(__VA_ARGS__); \ + } +#else +# define DMSG(...) +#endif + +//////////////////////////////////////////// +// Printing +/////////////////////////////////////////// +//#define PRINT_IN_BE +//#define NO_SPACE +//#define NO_NEWLINE diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c new file mode 100644 index 0000000000..9f779b7df9 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.c @@ -0,0 +1,10 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "error.h" + +__thread _bike_err_t bike_errno; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h new file mode 100644 index 0000000000..b1b9db6d5e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/error.h @@ -0,0 +1,33 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "utils/s2n_safety.h" + +#define SUCCESS 0 +#define FAIL (-1) + +#define ret_t int __attribute__((warn_unused_result)) + +enum _bike_err +{ + E_DECODING_FAILURE = 1, + E_AES_CTR_PRF_INIT_FAIL = 2, + E_AES_OVER_USED = 3, + EXTERNAL_LIB_ERROR_OPENSSL = 4, + E_FAIL_TO_GET_SEED = 5 +}; + +typedef enum _bike_err _bike_err_t; + +extern __thread _bike_err_t bike_errno; +#define BIKE_ERROR(x) \ + do { \ + bike_errno = (x); \ + return FAIL; \ + } while(0) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h new file mode 100644 index 0000000000..f4cdb53a80 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x.h @@ -0,0 +1,29 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "types.h" + +// c = a+b mod (x^r - 1) +_INLINE_ void +gf2x_mod_add(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b) +{ + const uint64_t *a_qwords = (const uint64_t *)a; + const uint64_t *b_qwords = (const uint64_t *)b; + uint64_t * c_qwords = (uint64_t *)c; + + for(size_t i = 0; i < R_PADDED_QWORDS; i++) { + c_qwords[i] = a_qwords[i] ^ b_qwords[i]; + } +} + +// c = a*b mod (x^r - 1) +void gf2x_mod_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b); + +// c = a^-1 mod (x^r - 1) +void gf2x_mod_inv(OUT pad_r_t *c, IN const pad_r_t *a); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h new file mode 100644 index 0000000000..a87478aba1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_internal.h @@ -0,0 +1,177 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +// For size_t +#include <stdlib.h> + +#include "pq-crypto/s2n_pq.h" +#include "types.h" + +// The size in quadwords of the operands in the gf2x_mul_base function +// for different implementations. +#define GF2X_PORT_BASE_QWORDS (1) +#define GF2X_PCLMUL_BASE_QWORDS (8) +#define GF2X_VPCLMUL_BASE_QWORDS (16) + +// ------------------ FUNCTIONS NEEDED FOR GF2X MULTIPLICATION ------------------ +// GF2X multiplication of a and b of size GF2X_BASE_QWORDS, c = a * b +void gf2x_mul_base_port(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b); +void karatzuba_add1_port(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); +void karatzuba_add2_port(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); +void karatzuba_add3_port(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); + +// -------------------- FUNCTIONS NEEDED FOR GF2X INVERSION -------------------- +// c = a^2 +void gf2x_sqr_port(OUT dbl_pad_r_t *c, IN const pad_r_t *a); +// The k-squaring function computes c = a^(2^k) % (x^r - 1), +// It is required by inversion, where l_param is derived from k. 
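+// Specifically, l_param = (2^-k) % r (see the k_sqr_* implementations).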
+void k_sqr_port(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); +// c = a mod (x^r - 1) +void gf2x_red_port(OUT pad_r_t *c, IN const dbl_pad_r_t *a); + +// AVX2 versions of the functions +#if defined(S2N_BIKE_R3_AVX2) +void karatzuba_add1_avx2(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); +void karatzuba_add2_avx2(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); +void karatzuba_add3_avx2(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); +void k_sqr_avx2(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); +void gf2x_red_avx2(OUT pad_r_t *c, IN const dbl_pad_r_t *a); +#endif + +// AVX512 versions of the functions +#if defined(S2N_BIKE_R3_AVX512) +void karatzuba_add1_avx512(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); +void karatzuba_add2_avx512(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); +void karatzuba_add3_avx512(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); +void k_sqr_avx512(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); +void gf2x_red_avx512(OUT pad_r_t *c, IN const dbl_pad_r_t *a); +#endif + +// PCLMUL based multiplication +#if defined(S2N_BIKE_R3_PCLMUL) +void gf2x_mul_base_pclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b); +void gf2x_sqr_pclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a); +#endif + +// VPCLMUL based multiplication +#if defined(S2N_BIKE_R3_VPCLMUL) +void gf2x_mul_base_vpclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b); +void gf2x_sqr_vpclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a); +#endif + +// GF2X methods struct +typedef struct gf2x_ctx_st { + size_t mul_base_qwords; + void (*mul_base)(OUT uint64_t *c, IN const uint64_t *a, IN const uint64_t *b); + void (*karatzuba_add1)(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len); + void (*karatzuba_add2)(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len); + void (*karatzuba_add3)(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len); + + void (*sqr)(OUT dbl_pad_r_t *c, IN const pad_r_t *a); + void (*k_sqr)(OUT pad_r_t *c, IN const pad_r_t *a, IN size_t l_param); + + void (*red)(OUT pad_r_t *c, IN const dbl_pad_r_t *a); +} gf2x_ctx; + +// Used in gf2x_inv.c to avoid initializing the context many times. 
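+// gf2x_mod_mul (gf2x_mul.c) is the public wrapper that initializes a fresh ctx
+// on every call.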
+void gf2x_mod_mul_with_ctx(OUT pad_r_t *c, + IN const pad_r_t *a, + IN const pad_r_t *b, + IN const gf2x_ctx *ctx); + +_INLINE_ void gf2x_ctx_init(gf2x_ctx *ctx) +{ +#if defined(S2N_BIKE_R3_AVX512) + if(s2n_bike_r3_is_avx512_enabled()) { + ctx->karatzuba_add1 = karatzuba_add1_avx512; + ctx->karatzuba_add2 = karatzuba_add2_avx512; + ctx->karatzuba_add3 = karatzuba_add3_avx512; + ctx->k_sqr = k_sqr_avx512; + ctx->red = gf2x_red_avx512; + } else +#endif +#if defined(S2N_BIKE_R3_AVX2) + if(s2n_bike_r3_is_avx2_enabled()) { + ctx->karatzuba_add1 = karatzuba_add1_avx2; + ctx->karatzuba_add2 = karatzuba_add2_avx2; + ctx->karatzuba_add3 = karatzuba_add3_avx2; + ctx->k_sqr = k_sqr_avx2; + ctx->red = gf2x_red_avx2; + } else +#endif + { + ctx->karatzuba_add1 = karatzuba_add1_port; + ctx->karatzuba_add2 = karatzuba_add2_port; + ctx->karatzuba_add3 = karatzuba_add3_port; + ctx->k_sqr = k_sqr_port; + ctx->red = gf2x_red_port; + } + +#if defined(S2N_BIKE_R3_VPCLMUL) + if(s2n_bike_r3_is_vpclmul_enabled()) { + ctx->mul_base_qwords = GF2X_VPCLMUL_BASE_QWORDS; + ctx->mul_base = gf2x_mul_base_vpclmul; + ctx->sqr = gf2x_sqr_vpclmul; + } else +#endif +#if defined(S2N_BIKE_R3_PCLMUL) + if(s2n_bike_r3_is_pclmul_enabled()) { + ctx->mul_base_qwords = GF2X_PCLMUL_BASE_QWORDS; + ctx->mul_base = gf2x_mul_base_pclmul; + ctx->sqr = gf2x_sqr_pclmul; + } else +#endif + { + ctx->mul_base_qwords = GF2X_PORT_BASE_QWORDS; + ctx->mul_base = gf2x_mul_base_port; + ctx->sqr = gf2x_sqr_port; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c new file mode 100644 index 0000000000..bea7ee84b1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_inv.c @@ -0,0 +1,156 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The inversion algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#include "cleanup.h" +#include "gf2x.h" +#include "gf2x_internal.h" + +// a = a^2 mod (x^r - 1) +_INLINE_ void gf2x_mod_sqr_in_place(IN OUT pad_r_t *a, + OUT dbl_pad_r_t *secure_buffer, + IN const gf2x_ctx *ctx) +{ + ctx->sqr(secure_buffer, a); + ctx->red(a, secure_buffer); +} + +// c = a^2^2^num_sqrs +_INLINE_ void repeated_squaring(OUT pad_r_t *c, + IN pad_r_t * a, + IN const size_t num_sqrs, + OUT dbl_pad_r_t *sec_buf, + IN const gf2x_ctx *ctx) +{ + c->val = a->val; + + for(size_t i = 0; i < num_sqrs; i++) { + gf2x_mod_sqr_in_place(c, sec_buf, ctx); + } +} + +// The gf2x_mod_inv function implements inversion in F_2[x]/(x^R - 1) +// based on [1](Algorithm 2). + +// In every iteration, [1](Algorithm 2) performs two exponentiations: +// exponentiation 0 (exp0) and exponentiation 1 (exp1) of the form f^(2^k). +// These exponentiations are computed either by repeated squaring of f, k times, +// or by a single k-squaring of f. The method for a specific value of k +// is chosen based on the performance of squaring and k-squaring. +// +// Benchmarks on several platforms indicate that a good threshold +// for switching from repeated squaring to k-squaring is k = 64. 
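+// That is, f^(2^k) is computed with k consecutive squarings when k <= 64, and
+// with a single bit-permutation k-squaring otherwise (see gf2x_mod_inv below).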
+#define K_SQR_THR (64) + +// k-squaring is computed by a permutation of bits of the input polynomial, +// as defined in [1](Observation 1). The required parameter for the permutation +// is l = (2^k)^-1 % R. +// Therefore, there are two sets of parameters for every exponentiation: +// - exp0_k and exp1_k +// - exp0_l and exp1_l + +// Exponentiation 0 computes f^2^2^(i-1) for 0 < i < MAX_I. +// Exponentiation 1 computes f^2^((r-2) % 2^i) for 0 < i < MAX_I, +// only when the i-th bit of (r-2) is 1. Therefore, the value 0 in +// exp1_k[i] and exp1_l[i] means that exp1 is skipped in i-th iteration. + +// To quickly generate all the required parameters in Sage: +// r = DESIRED_R +// max_i = floor(log(r-2, 2)) + 1 +// exp0_k = [2^i for i in range(max_i)] +// exp0_l = [inverse_mod((2^k) % r, r) for k in exp0_k] +// exp1_k = [(r-2)%(2^i) if ((r-2) & (1<<i)) else 0 for i in range(max_i)] +// exp1_l = [inverse_mod((2^k) % r, r) if k != 0 else 0 for k in exp1_k] + +#if(LEVEL == 1) +// The parameters below are hard-coded for R=12323 +bike_static_assert((R_BITS == 12323), gf2x_inv_r_doesnt_match_parameters); + +// MAX_I = floor(log(r-2)) + 1 +# define MAX_I (14) +# define EXP0_K_VALS \ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192 +# define EXP0_L_VALS \ + 6162, 3081, 3851, 5632, 22, 484, 119, 1838, 1742, 3106, 10650, 1608, 10157, \ + 8816 +# define EXP1_K_VALS 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 33, 4129 +# define EXP1_L_VALS 0, 0, 0, 0, 0, 6162, 0, 0, 0, 0, 0, 0, 242, 5717 + +#else +// The parameters below are hard-coded for R=24659 +bike_static_assert((R_BITS == 24659), gf2x_inv_r_doesnt_match_parameters); + +// MAX_I = floor(log(r-2)) + 1 +# define MAX_I (15) +# define EXP0_K_VALS \ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 +# define EXP0_L_VALS \ + 12330, 6165, 7706, 3564, 2711, 1139, 15053, 1258, 4388, 20524, 9538, 6393, \ + 10486, 1715, 6804 +# define EXP1_K_VALS 0, 0, 0, 0, 1, 0, 17, 0, 0, 0, 0, 0, 0, 81, 8273 +# define EXP1_L_VALS 0, 0, 0, 0, 12330, 0, 13685, 0, 0, 0, 0, 0, 0, 23678, 19056 + +#endif + +// Inversion in F_2[x]/(x^R - 1), [1](Algorithm 2). +// c = a^{-1} mod x^r-1 +void gf2x_mod_inv(OUT pad_r_t *c, IN const pad_r_t *a) +{ + // Initialize gf2x methods struct + gf2x_ctx ctx = {0}; + gf2x_ctx_init(&ctx); + + // Note that exp0/1_k/l are predefined constants that depend only on the value + // of R. This value is public. Therefore, branches in this function, which + // depends on R, are also "public". Code that releases these branches + // (taken/not-taken) does not leak secret information. 
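+  // Worked example (LEVEL 1, r = 12323): r - 2 = 12321 = 2^13 + 2^12 + 2^5 + 2^0,
+  // so exponentiation 1 runs only in iterations i = 5, 12 and 13 of the loop
+  // below (the non-zero entries of exp1_k).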
+ const size_t exp0_k[MAX_I] = {EXP0_K_VALS}; + const size_t exp0_l[MAX_I] = {EXP0_L_VALS}; + const size_t exp1_k[MAX_I] = {EXP1_K_VALS}; + const size_t exp1_l[MAX_I] = {EXP1_L_VALS}; + + DEFER_CLEANUP(pad_r_t f = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t g = {0}, pad_r_cleanup); + DEFER_CLEANUP(pad_r_t t = {0}, pad_r_cleanup); + DEFER_CLEANUP(dbl_pad_r_t sec_buf = {0}, dbl_pad_r_cleanup); + + // Steps 2 and 3 in [1](Algorithm 2) + f.val = a->val; + t.val = a->val; + + for(size_t i = 1; i < MAX_I; i++) { + // Step 5 in [1](Algorithm 2), exponentiation 0: g = f^2^2^(i-1) + if(exp0_k[i - 1] <= K_SQR_THR) { + repeated_squaring(&g, &f, exp0_k[i - 1], &sec_buf, &ctx); + } else { + ctx.k_sqr(&g, &f, exp0_l[i - 1]); + } + + // Step 6, [1](Algorithm 2): f = f*g + gf2x_mod_mul_with_ctx(&f, &g, &f, &ctx); + + if(exp1_k[i] != 0) { + // Step 8, [1](Algorithm 2), exponentiation 1: g = f^2^((r-2) % 2^i) + if(exp1_k[i] <= K_SQR_THR) { + repeated_squaring(&g, &f, exp1_k[i], &sec_buf, &ctx); + } else { + ctx.k_sqr(&g, &f, exp1_l[i]); + } + + // Step 9, [1](Algorithm 2): t = t*g; + gf2x_mod_mul_with_ctx(&t, &g, &t, &ctx); + } + } + + // Step 10, [1](Algorithm 2): c = t^2 + gf2x_mod_sqr_in_place(&t, &sec_buf, &ctx); + c->val = t.val; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c new file mode 100644 index 0000000000..91ed73d3f2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx2.c @@ -0,0 +1,188 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The k-squaring algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +#define NUM_YMMS (2) +#define NUM_OF_VALS (NUM_YMMS * WORDS_IN_YMM) + +_INLINE_ void generate_map(OUT uint16_t *map, IN const uint16_t l_param) +{ + __m256i vmap[NUM_YMMS], vtmp[NUM_YMMS], vr, inc, zero; + + // The permutation map is generated in the following way: + // 1. for i = 0 to map size: + // 2. map[i] = (i * l_param) % r + // However, to avoid the expensive multiplication and modulo operations + // we modify the algorithm to: + // 1. map[0] = l_param + // 2. for i = 1 to map size: + // 3. map[i] = map[i - 1] + l_param + // 4. if map[i] >= r: + // 5. map[i] = map[i] - r + // This algorithm is parallelized with vector instructions by processing + // certain number of values (NUM_OF_VALS) in parallel. Therefore, + // in the beginning we need to initialize the first NUM_OF_VALS elements. + for(size_t i = 0; i < NUM_OF_VALS; i++) { + map[i] = (i * l_param) % R_BITS; + } + + vr = SET1_I16(R_BITS); + zero = SET_ZERO; + + // Set the increment vector such that adding it to vmap vectors + // gives the next NUM_OF_VALS elements of the map. AVX2 does not + // support comparison of vectors where vector elements are considered + // as unsigned integers. 
This is a problem when r > 2^14 because + // sum of two values can be greater than 2^15 which would make the it + // a negative number when considered as a signed 16-bit integer, + // and therefore, the condition in step 4 of the algorithm would be + // evaluated incorrectly. So, we use the following trick: + // we subtract R from the increment and modify the algorithm: + // 1. map[0] = l_param + // 2. for i = 1 to map size: + // 3. map[i] = map[i - 1] + (l_param - r) + // 4. if map[i] < 0: + // 5. map[i] = map[i] + r + inc = SET1_I16((l_param * NUM_OF_VALS) % R_BITS); + inc = SUB_I16(inc, vr); + + // Load the first NUM_OF_VALS elements in the vmap vectors + for(size_t i = 0; i < NUM_YMMS; i++) { + vmap[i] = LOAD(&map[i * WORDS_IN_YMM]); + } + + for(size_t i = NUM_YMMS; i < (R_PADDED / WORDS_IN_YMM); i += NUM_YMMS) { + for(size_t j = 0; j < NUM_YMMS; j++) { + vmap[j] = ADD_I16(vmap[j], inc); + vtmp[j] = CMPGT_I16(zero, vmap[j]); + vmap[j] = ADD_I16(vmap[j], vtmp[j] & vr); + + STORE(&map[(i + j) * WORDS_IN_YMM], vmap[j]); + } + } +} + +// Convert from bytes representation, where every byte holds a single bit, +// of the polynomial, to a binary representation where every byte +// holds 8 bits of the polynomial. +_INLINE_ void bytes_to_bin(OUT pad_r_t *bin_buf, IN const uint8_t *bytes_buf) +{ + uint32_t *bin32 = (uint32_t *)bin_buf; + + for(size_t i = 0; i < R_QWORDS * 2; i++) { + __m256i t = LOAD(&bytes_buf[i * BYTES_IN_YMM]); + bin32[i] = MOVEMASK(t); + } +} + +// Convert from binary representation where every byte holds 8 bits +// of the polynomial, to byte representation where +// every byte holds a single bit of the polynomial. +_INLINE_ void bin_to_bytes(OUT uint8_t *bytes_buf, IN const pad_r_t *bin_buf) +{ + // The algorithm works by taking every 32 bits of the input and converting + // them to 32 bytes where each byte holds one of the bits. The first step is + // to broadcast a 32-bit value (call it a) to all elements of vector t. + // Then t contains bytes of a in the following order: + // t = [ a3 a2 a1 a0 ... a3 a2 a1 a0 ] + // where a0 contains the first 8 bits of a, a1 the second 8 bits, etc. + // Let the output vector be [ out31 out30 ... out0 ]. We want to store + // bit 0 of a in out0 byte, bit 1 of a in out1 byte, ect. (note that + // we want to store the bit in the most significant position of a byte + // because this is required by MOVEMASK instruction used in bytes_to_bin.) + // + // Ideally, we would shuffle the bytes of t such that the byte in + // i-th position contains i-th bit of val, shift t appropriately and obtain + // the result. However, AVX2 doesn't support shift operation on bytes, only + // shifts of individual QWORDS (64 bit) and DWORDS (32 bit) are allowed. + // Consider the two least significant DWORDS of t: + // t = [ ... | a3 a2 a1 a0 | a3 a2 a1 a0 ] + // and shift them by 6 and 4 to the left, respectively, to obtain: + // t = [ ... | t7 t6 t5 t4 | t3 t2 t1 t0 ] + // where t3 = a3 << 6, t2 = a2 << 6, t1 = a1 << 6, t0 = a0 << 6, + // and t7 = a3 << 4, t6 = a2 << 4, t5 = a1 << 4, t4 = a0 << 4. + // Now we shuffle vector t to obtain vector p such that: + // p = [ ... | t12 t12 t8 t8 | t4 t4 t0 t0 ] + // Note that in every even position of the vector p we have the right byte + // of the input shifted by the required shift. The values in the odd + // positions contain the right bytes of the input but they need to be shifted + // one more time to the left by 1. By shifting each DWORD of p by 1 we get: + // q = [ ... 
| p7 p6 p5 p4 | p3 p2 p1 p0 ] + // where p1 = t0 << 1 = a0 << 7, p3 = t4 << 1 = 5, etc. Therefore, by + // blending p and q (taking even positions from p and odd positions from q) + // we obtain the desired result. + + __m256i t, p, q; + + const __m256i shift_mask = SET_I32(0, 2, 4, 6, 0, 2, 4, 6); + + const __m256i shuffle_mask = + SET_I8(15, 15, 11, 11, 7, 7, 3, 3, 14, 14, 10, 10, 6, 6, 2, 2, 13, 13, 9, 9, + 5, 5, 1, 1, 12, 12, 8, 8, 4, 4, 0, 0); + + const __m256i blend_mask = SET1_I16(0x00ff); + + const uint32_t *bin32 = (const uint32_t *)bin_buf; + + for(size_t i = 0; i < R_QWORDS * 2; i++) { + t = SET1_I32(bin32[i]); + t = SLLV_I32(t, shift_mask); + + p = SHUF_I8(t, shuffle_mask); + q = SLLI_I32(p, 1); + + STORE(&bytes_buf[i * 32], BLENDV_I8(p, q, blend_mask)); + } +} + +// The k-squaring function computes c = a^(2^k) % (x^r - 1). +// By [1](Observation 1), if +// a = sum_{j in supp(a)} x^j, +// then +// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r). +// Therefore, k-squaring can be computed as permutation of the bits of "a": +// pi0 : j --> (j * 2^k) % r. +// For improved performance, we compute the result by inverted permutation pi1: +// pi1 : (j * 2^-k) % r --> j. +// Input argument l_param is defined as the value (2^-k) % r. +void k_sqr_avx2(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param) +{ + ALIGN(ALIGN_BYTES) uint16_t map[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t a_bytes[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t c_bytes[R_PADDED] = {0}; + + // Generate the permutation map defined by pi1 and l_param. + generate_map(map, l_param); + + bin_to_bytes(a_bytes, a); + + // Permute "a" using the generated permutation map. + for(size_t i = 0; i < R_BITS; i++) { + c_bytes[i] = a_bytes[map[i]]; + } + + bytes_to_bin(c, c_bytes); + + secure_clean(a_bytes, sizeof(a_bytes)); + secure_clean(c_bytes, sizeof(c_bytes)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c new file mode 100644 index 0000000000..af2c5738a8 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_avx512.c @@ -0,0 +1,135 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The k-squaring algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +#define NUM_ZMMS (2) +#define NUM_OF_VALS (NUM_ZMMS * WORDS_IN_ZMM) + +// clang-3.9 doesn't recognize these two macros +#if !defined(_MM_CMPINT_EQ) +# define _MM_CMPINT_EQ (0) +#endif + +#if !defined(_MM_CMPINT_NLT) +# define _MM_CMPINT_NLT (5) +#endif + +_INLINE_ void generate_map(OUT uint16_t *map, IN const size_t l_param) +{ + __m512i vmap[NUM_ZMMS], vr, inc; + __mmask32 mask[NUM_ZMMS]; + + // The permutation map is generated in the following way: + // 1. for i = 0 to map size: + // 2. map[i] = (i * l_param) % r + // However, to avoid the expensive multiplication and modulo operations + // we modify the algorithm to: + // 1. map[0] = l_param + // 2. 
for i = 1 to map size: + // 3. map[i] = map[i - 1] + l_param + // 4. if map[i] >= r: + // 5. map[i] = map[i] - r + // This algorithm is parallelized with vector instructions by processing + // certain number of values (NUM_OF_VALS) in parallel. Therefore, + // in the beginning we need to initialize the first NUM_OF_VALS elements. + for(size_t i = 0; i < NUM_OF_VALS; i++) { + map[i] = (i * l_param) % R_BITS; + } + + // Set the increment vector such that by adding it to vmap vectors + // we will obtain the next NUM_OF_VALS elements of the map. + inc = SET1_I16((l_param * NUM_OF_VALS) % R_BITS); + vr = SET1_I16(R_BITS); + + // Load the first NUM_OF_VALS elements in the vmap vectors + for(size_t i = 0; i < NUM_ZMMS; i++) { + vmap[i] = LOAD(&map[i * WORDS_IN_ZMM]); + } + + for(size_t i = NUM_ZMMS; i < (R_PADDED / WORDS_IN_ZMM); i += NUM_ZMMS) { + for(size_t j = 0; j < NUM_ZMMS; j++) { + vmap[j] = ADD_I16(vmap[j], inc); + mask[j] = CMPM_U16(vmap[j], vr, _MM_CMPINT_NLT); + vmap[j] = MSUB_I16(vmap[j], mask[j], vmap[j], vr); + + STORE(&map[(i + j) * WORDS_IN_ZMM], vmap[j]); + } + } +} + +// Convert from bytes representation where each byte holds a single bit +// to binary representation where each byte holds 8 bits of the polynomial +_INLINE_ void bytes_to_bin(OUT pad_r_t *bin_buf, IN const uint8_t *bytes_buf) +{ + uint64_t *bin64 = (uint64_t *)bin_buf; + + __m512i first_bit_mask = SET1_I8(1); + for(size_t i = 0; i < R_QWORDS; i++) { + __m512i t = LOAD(&bytes_buf[i * BYTES_IN_ZMM]); + bin64[i] = CMPM_U8(t, first_bit_mask, _MM_CMPINT_EQ); + } +} + +// Convert from binary representation where each byte holds 8 bits +// to byte representation where each byte holds a single bit of the polynomial +_INLINE_ void bin_to_bytes(OUT uint8_t *bytes_buf, IN const pad_r_t *bin_buf) +{ + const uint64_t *bin64 = (const uint64_t *)bin_buf; + + for(size_t i = 0; i < R_QWORDS; i++) { + __m512i t = SET1MZ_I8(bin64[i], 1); + STORE(&bytes_buf[i * BYTES_IN_ZMM], t); + } +} + +// The k-squaring function computes c = a^(2^k) % (x^r - 1), +// By [1](Observation 1), if +// a = sum_{j in supp(a)} x^j, +// then +// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r). +// Therefore, k-squaring can be computed as permutation of the bits of "a": +// pi0 : j --> (j * 2^k) % r. +// For improved performance, we compute the result by inverted permutation pi1: +// pi1 : (j * 2^-k) % r --> j. +// Input argument l_param is defined as the value (2^-k) % r. +void k_sqr_avx512(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param) +{ + ALIGN(ALIGN_BYTES) uint16_t map[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t a_bytes[R_PADDED]; + ALIGN(ALIGN_BYTES) uint8_t c_bytes[R_PADDED] = {0}; + + // Generate the permutation map defined by pi1 and l_param. + generate_map(map, l_param); + + bin_to_bytes(a_bytes, a); + + // Permute "a" using the generated permutation map. + for(size_t i = 0; i < R_BITS; i++) { + c_bytes[i] = a_bytes[map[i]]; + } + + bytes_to_bin(c, c_bytes); + + secure_clean(a_bytes, sizeof(a_bytes)); + secure_clean(c_bytes, sizeof(c_bytes)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c new file mode 100644 index 0000000000..c757687f58 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_ksqr_portable.c @@ -0,0 +1,48 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + * + * The k-squaring algorithm in this file is based on: + * [1] Nir Drucker, Shay Gueron, and Dusan Kostic. 2020. "Fast polynomial + * inversion for post quantum QC-MDPC cryptography". Cryptology ePrint Archive, + * 2020. https://eprint.iacr.org/2020/298.pdf + */ + +#include "gf2x_internal.h" +#include "utilities.h" + +#define BITS_IN_BYTE (8) + +// The k-squaring function computes c = a^(2^k) % (x^r - 1), +// By [1](Observation 1), if +// a = sum_{j in supp(a)} x^j, +// then +// a^(2^k) % (x^r - 1) = sum_{j in supp(a)} x^((j * 2^k) % r). +// Therefore, k-squaring can be computed as permutation of the bits of "a": +// pi0 : j --> (j * 2^k) % r. +// For improved performance, we compute the result by inverted permutation pi1: +// pi1 : (j * 2^-k) % r --> j. +// Input argument l_param is defined as the value (2^-k) % r. +void k_sqr_port(OUT pad_r_t *c, IN const pad_r_t *a, IN const size_t l_param) +{ + bike_memset(c->val.raw, 0, sizeof(c->val)); + + // Compute the result byte by byte + size_t idx = 0; + for(size_t i = 0; i < R_BYTES; i++) { + for(size_t j = 0; j < BITS_IN_BYTE; j++, idx++) { + // Bit of "c" at position idx is set to the value of + // the bit of "a" at position pi1(idx) = (l_param * idx) % R_BITS. + size_t pos = (l_param * idx) % R_BITS; + + size_t pos_byte = pos >> 3; + size_t pos_bit = pos & 7; + uint8_t bit = (a->val.raw[pos_byte] >> pos_bit) & 1; + + c->val.raw[i] |= (bit << j); + } + } + c->val.raw[R_BYTES - 1] &= LAST_R_BYTE_MASK; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c new file mode 100644 index 0000000000..ae1d7a510a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul.c @@ -0,0 +1,113 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x.h" +#include "gf2x_internal.h" + +// The secure buffer size required for Karatsuba is computed by: +// size(n) = 3*n/2 + size(n/2) = 3*sum_{i}{n/2^i} < 3n +#define SECURE_BUFFER_QWORDS (3 * R_PADDED_QWORDS) + +// Karatsuba multiplication algorithm. +// Input arguments a and b are padded with zeros, here: +// - n: real number of digits in a and b (R_QWORDS) +// - n_padded: padded number of digits of a and b (assumed to be power of 2) +// A buffer sec_buf is used for storing temporary data between recursion calls. +// It might contain secrets, and therefore should be securely cleaned after +// completion. 
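// [Editor's note: illustrative arithmetic only, not part of the upstream
// sources.] Unrolling the size recurrence above for a hypothetical padded
// length of n = 16 qwords shows why 3*n is a safe bound:
//   size(16) = 24 + size(8) = 24 + 12 + size(4) = 24 + 12 + 6 + size(2)
//            = 24 + 12 + 6 + 3 + size(1) = 45 + size(1) < 48 = 3 * 16.
// Each recursion level needs three scratch blocks of half the current
// length (alah, blbh and tmp in the function below), which is the 3*n/2 term.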
+_INLINE_ void karatzuba(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len, + IN const size_t qwords_len_pad, + uint64_t * sec_buf, + IN const gf2x_ctx *ctx) +{ + if(qwords_len <= ctx->mul_base_qwords) { + ctx->mul_base(c, a, b); + return; + } + + const size_t half_qw_len = qwords_len_pad >> 1; + + // Split a and b into low and high parts of size n_padded/2 + const uint64_t *a_lo = a; + const uint64_t *b_lo = b; + const uint64_t *a_hi = &a[half_qw_len]; + const uint64_t *b_hi = &b[half_qw_len]; + + // Split c into 4 parts of size n_padded/2 (the last ptr is not needed) + uint64_t *c0 = c; + uint64_t *c1 = &c[half_qw_len]; + uint64_t *c2 = &c[half_qw_len * 2]; + + // Allocate 3 ptrs of size n_padded/2 on sec_buf + uint64_t *alah = sec_buf; + uint64_t *blbh = &sec_buf[half_qw_len]; + uint64_t *tmp = &sec_buf[half_qw_len * 2]; + + // Move sec_buf ptr to the first free location for the next recursion call + sec_buf = &sec_buf[half_qw_len * 3]; + + // Compute a_lo*b_lo and store the result in (c1|c0) + karatzuba(c0, a_lo, b_lo, half_qw_len, half_qw_len, sec_buf, ctx); + + // If the real number of digits n is less or equal to n_padded/2 then: + // a_hi = 0 and b_hi = 0 + // and + // (a_hi|a_lo)*(b_hi|b_lo) = a_lo*b_lo + // so we can skip the remaining two multiplications + if(qwords_len > half_qw_len) { + // Compute a_hi*b_hi and store the result in (c3|c2) + karatzuba(c2, a_hi, b_hi, qwords_len - half_qw_len, half_qw_len, sec_buf, + ctx); + + // Compute alah = (a_lo + a_hi) and blbh = (b_lo + b_hi) + ctx->karatzuba_add1(alah, blbh, a, b, half_qw_len); + + // Compute (c1 + c2) and store the result in tmp + ctx->karatzuba_add2(tmp, c1, c2, half_qw_len); + + // Compute alah*blbh and store the result in (c2|c1) + karatzuba(c1, alah, blbh, half_qw_len, half_qw_len, sec_buf, ctx); + + // Add (tmp|tmp) and (c3|c0) to (c2|c1) + ctx->karatzuba_add3(c0, tmp, half_qw_len); + } +} + +void gf2x_mod_mul_with_ctx(OUT pad_r_t *c, + IN const pad_r_t *a, + IN const pad_r_t *b, + IN const gf2x_ctx *ctx) +{ + bike_static_assert((R_PADDED_BYTES % 2 == 0), karatzuba_n_is_odd); + + DEFER_CLEANUP(dbl_pad_r_t t = {0}, dbl_pad_r_cleanup); + ALIGN(ALIGN_BYTES) uint64_t secure_buffer[SECURE_BUFFER_QWORDS]; + + karatzuba((uint64_t *)&t, (const uint64_t *)a, (const uint64_t *)b, R_QWORDS, + R_PADDED_QWORDS, secure_buffer, ctx); + + ctx->red(c, &t); + + secure_clean((uint8_t *)secure_buffer, sizeof(secure_buffer)); +} + +void gf2x_mod_mul(OUT pad_r_t *c, IN const pad_r_t *a, IN const pad_r_t *b) +{ + bike_static_assert((R_PADDED_BYTES % 2 == 0), karatzuba_n_is_odd); + + // Initialize gf2x methods struct + gf2x_ctx ctx = {0}; + gf2x_ctx_init(&ctx); + + gf2x_mod_mul_with_ctx(c, a, b, &ctx); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c new file mode 100644 index 0000000000..8f9c17dc09 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx2.c @@ -0,0 +1,109 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +void karatzuba_add1_avx2(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T va0, va1, vb0, vb1; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + va0 = LOAD(&a[i]); + va1 = LOAD(&a[i + qwords_len]); + vb0 = LOAD(&b[i]); + vb1 = LOAD(&b[i + qwords_len]); + + STORE(&alah[i], va0 ^ va1); + STORE(&blbh[i], vb0 ^ vb1); + } +} + +void karatzuba_add2_avx2(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vx, vy; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vx = LOAD(&x[i]); + vy = LOAD(&y[i]); + + STORE(&z[i], vx ^ vy); + } +} + +void karatzuba_add3_avx2(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vr0, vr1, vr2, vr3, vt; + + uint64_t *c0 = c; + uint64_t *c1 = &c[qwords_len]; + uint64_t *c2 = &c[2 * qwords_len]; + uint64_t *c3 = &c[3 * qwords_len]; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vr0 = LOAD(&c0[i]); + vr1 = LOAD(&c1[i]); + vr2 = LOAD(&c2[i]); + vr3 = LOAD(&c3[i]); + vt = LOAD(&mid[i]); + + STORE(&c1[i], vt ^ vr0 ^ vr1); + STORE(&c2[i], vt ^ vr2 ^ vr3); + } +} + +// c = a mod (x^r - 1) +void gf2x_red_avx2(OUT pad_r_t *c, IN const dbl_pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) { + REG_T vt0 = LOAD(&a64[i]); + REG_T vt1 = LOAD(&a64[i + R_QWORDS]); + REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]); + + vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL); + vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD); + + vt0 ^= (vt1 | vt2); + + STORE(&c64[i], vt0); + } + + c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK; + + // Clean the secrets from the upper part of c + secure_clean((uint8_t *)&c64[R_QWORDS], + (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c new file mode 100644 index 0000000000..78ce9683ad --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_avx512.c @@ -0,0 +1,109 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +void karatzuba_add1_avx512(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T va0, va1, vb0, vb1; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + va0 = LOAD(&a[i]); + va1 = LOAD(&a[i + qwords_len]); + vb0 = LOAD(&b[i]); + vb1 = LOAD(&b[i + qwords_len]); + + STORE(&alah[i], va0 ^ va1); + STORE(&blbh[i], vb0 ^ vb1); + } +} + +void karatzuba_add2_avx512(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vx, vy; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vx = LOAD(&x[i]); + vy = LOAD(&y[i]); + + STORE(&z[i], vx ^ vy); + } +} + +void karatzuba_add3_avx512(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vr0, vr1, vr2, vr3, vt; + + uint64_t *c0 = c; + uint64_t *c1 = &c[qwords_len]; + uint64_t *c2 = &c[2 * qwords_len]; + uint64_t *c3 = &c[3 * qwords_len]; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vr0 = LOAD(&c0[i]); + vr1 = LOAD(&c1[i]); + vr2 = LOAD(&c2[i]); + vr3 = LOAD(&c3[i]); + vt = LOAD(&mid[i]); + + STORE(&c1[i], vt ^ vr0 ^ vr1); + STORE(&c2[i], vt ^ vr2 ^ vr3); + } +} + +// c = a mod (x^r - 1) +void gf2x_red_avx512(OUT pad_r_t *c, IN const dbl_pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) { + REG_T vt0 = LOAD(&a64[i]); + REG_T vt1 = LOAD(&a64[i + R_QWORDS]); + REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]); + + vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL); + vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD); + + vt0 ^= (vt1 | vt2); + + STORE(&c64[i], vt0); + } + + c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK; + + // Clean the secrets from the upper part of c + secure_clean((uint8_t *)&c64[R_QWORDS], + (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t)); +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c new file mode 100644 index 0000000000..1d4553997c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_pclmul.c @@ -0,0 +1,155 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#if defined(S2N_BIKE_R3_PCLMUL) + +#include <immintrin.h> + +#include "gf2x_internal.h" + +#define LOAD128(mem) _mm_loadu_si128((const void *)(mem)) +#define STORE128(mem, reg) _mm_storeu_si128((void *)(mem), (reg)) +#define UNPACKLO(x, y) _mm_unpacklo_epi64((x), (y)) +#define UNPACKHI(x, y) _mm_unpackhi_epi64((x), (y)) +#define CLMUL(x, y, imm) _mm_clmulepi64_si128((x), (y), (imm)) +#define BSRLI(x, imm) _mm_srli_si128((x), (imm)) +#define BSLLI(x, imm) _mm_slli_si128((x), (imm)) + +// 4x4 Karatsuba multiplication +_INLINE_ void gf2x_mul4_int(OUT __m128i c[4], + IN const __m128i a_lo, + IN const __m128i a_hi, + IN const __m128i b_lo, + IN const __m128i b_hi) +{ + // a_lo = [a1 | a0]; a_hi = [a3 | a2]; + // b_lo = [b1 | b0]; b_hi = [b3 | b2]; + // 4x4 Karatsuba requires three 2x2 multiplications: + // (1) a_lo * b_lo + // (2) a_hi * b_hi + // (3) aa * bb = (a_lo + a_hi) * (b_lo + b_hi) + // Each of the three 2x2 multiplications requires three 1x1 multiplications: + // (1) is computed by a0*b0, a1*b1, (a0+a1)*(b0+b1) + // (2) is computed by a2*b2, a3*b3, (a2+a3)*(b2+b3) + // (3) is computed by aa0*bb0, aa1*bb1, (aa0+aa1)*(bb0+bb1) + // All the required additions are performed in the end. + + __m128i aa, bb; + __m128i xx, yy, uu, vv, m; + __m128i lo[2], hi[2], mi[2]; + __m128i t[9]; + + aa = a_lo ^ a_hi; + bb = b_lo ^ b_hi; + + // xx <-- [(a2+a3) | (a0+a1)] + // yy <-- [(b2+b3) | (b0+b1)] + xx = UNPACKLO(a_lo, a_hi); + yy = UNPACKLO(b_lo, b_hi); + xx = xx ^ UNPACKHI(a_lo, a_hi); + yy = yy ^ UNPACKHI(b_lo, b_hi); + + // uu <-- [ 0 | (aa0+aa1)] + // vv <-- [ 0 | (bb0+bb1)] + uu = aa ^ BSRLI(aa, 8); + vv = bb ^ BSRLI(bb, 8); + + // 9 multiplications + t[0] = CLMUL(a_lo, b_lo, 0x00); + t[1] = CLMUL(a_lo, b_lo, 0x11); + t[2] = CLMUL(a_hi, b_hi, 0x00); + t[3] = CLMUL(a_hi, b_hi, 0x11); + t[4] = CLMUL(xx, yy, 0x00); + t[5] = CLMUL(xx, yy, 0x11); + t[6] = CLMUL(aa, bb, 0x00); + t[7] = CLMUL(aa, bb, 0x11); + t[8] = CLMUL(uu, vv, 0x00); + + t[4] ^= (t[0] ^ t[1]); + t[5] ^= (t[2] ^ t[3]); + t[8] ^= (t[6] ^ t[7]); + + lo[0] = t[0] ^ BSLLI(t[4], 8); + lo[1] = t[1] ^ BSRLI(t[4], 8); + hi[0] = t[2] ^ BSLLI(t[5], 8); + hi[1] = t[3] ^ BSRLI(t[5], 8); + mi[0] = t[6] ^ BSLLI(t[8], 8); + mi[1] = t[7] ^ BSRLI(t[8], 8); + + m = lo[1] ^ hi[0]; + + c[0] = lo[0]; + c[1] = lo[0] ^ mi[0] ^ m; + c[2] = hi[1] ^ mi[1] ^ m; + c[3] = hi[1]; +} + +// 512x512bit multiplication performed by Karatsuba algorithm +// where a and b are considered as having 8 digits of size 64 bits. 
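// [Editor's note: clarifying annotation, not part of the upstream sources.]
// Writing a = a_lo + a_hi*x^256 and b = b_lo + b_hi*x^256, the function below
// applies the one-level Karatsuba identity over GF(2)[x], where addition is XOR:
//   a*b = lo ^ (lo ^ hi ^ mi)*x^256 ^ hi*x^512,
// with lo = a_lo*b_lo, hi = a_hi*b_hi and mi = (a_lo^a_hi)*(b_lo^b_hi),
// so three 256x256-bit products replace the four of the schoolbook method.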
+void gf2x_mul_base_pclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b) +{ + __m128i va[4], vb[4]; + __m128i aa[2], bb[2]; + __m128i lo[4], hi[4], mi[4], m[2]; + + for(size_t i = 0; i < 4; i++) { + va[i] = LOAD128(&a[QWORDS_IN_XMM * i]); + vb[i] = LOAD128(&b[QWORDS_IN_XMM * i]); + } + + // Multiply the low and the high halves of a and b + // lo <-- a_lo * b_lo + // hi <-- a_hi * b_hi + gf2x_mul4_int(lo, va[0], va[1], vb[0], vb[1]); + gf2x_mul4_int(hi, va[2], va[3], vb[2], vb[3]); + + // Compute the middle multiplication + // aa <-- a_lo + a_hi + // bb <-- b_lo + b_hi + // mi <-- aa * bb + aa[0] = va[0] ^ va[2]; + aa[1] = va[1] ^ va[3]; + bb[0] = vb[0] ^ vb[2]; + bb[1] = vb[1] ^ vb[3]; + gf2x_mul4_int(mi, aa[0], aa[1], bb[0], bb[1]); + + m[0] = lo[2] ^ hi[0]; + m[1] = lo[3] ^ hi[1]; + + STORE128(&c[0 * QWORDS_IN_XMM], lo[0]); + STORE128(&c[1 * QWORDS_IN_XMM], lo[1]); + STORE128(&c[2 * QWORDS_IN_XMM], mi[0] ^ lo[0] ^ m[0]); + STORE128(&c[3 * QWORDS_IN_XMM], mi[1] ^ lo[1] ^ m[1]); + STORE128(&c[4 * QWORDS_IN_XMM], mi[2] ^ hi[2] ^ m[0]); + STORE128(&c[5 * QWORDS_IN_XMM], mi[3] ^ hi[3] ^ m[1]); + STORE128(&c[6 * QWORDS_IN_XMM], hi[2]); + STORE128(&c[7 * QWORDS_IN_XMM], hi[3]); +} + +void gf2x_sqr_pclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a) +{ + __m128i va, vr0, vr1; + + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < (R_XMM * QWORDS_IN_XMM); i += QWORDS_IN_XMM) { + va = LOAD128(&a64[i]); + + vr0 = CLMUL(va, va, 0x00); + vr1 = CLMUL(va, va, 0x11); + + STORE128(&c64[i * 2], vr0); + STORE128(&c64[i * 2 + QWORDS_IN_XMM], vr1); + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c new file mode 100644 index 0000000000..86c21a1e28 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_portable.c @@ -0,0 +1,77 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include "gf2x_internal.h" +#include "utilities.h" + +#define LSB3(x) ((x)&7) + +// 64x64 bit multiplication +// The algorithm is based on the windowing method, for example as in: +// Brent, R. P., Gaudry, P., Thomé, E., & Zimmermann, P. (2008, May), "Faster +// multiplication in GF (2)[x]". In: International Algorithmic Number Theory +// Symposium (pp. 153-166). Springer, Berlin, Heidelberg. In this implementation, +// the last three bits are multiplied using a schoolbook multiplication. +void gf2x_mul_base_port(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b) +{ + uint64_t h = 0, l = 0, g1, g2, u[8]; + const uint64_t w = 64; + const uint64_t s = 3; + const uint64_t a0 = a[0]; + const uint64_t b0 = b[0]; + + // Multiplying 64 bits by 7 can results in an overflow of 3 bits. + // Therefore, these bits are masked out, and are treated in step 3. + const uint64_t b0m = b0 & MASK(61); + + // Step 1: Calculate a multiplication table with 8 entries. 
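  // [Editor's note: illustrative, not part of the upstream sources.]
  // Each entry u[i] is the carry-less (GF(2)[x]) product of the 3-bit
  // polynomial i and b0m, e.g. u[3] = b0m ^ (b0m << 1) and
  // u[5] = b0m ^ (b0m << 2), so Step 2 can process three bits of a0 per
  // table lookup instead of one bit at a time.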
+ u[0] = 0; + u[1] = b0m; + u[2] = u[1] << 1; + u[3] = u[2] ^ b0m; + u[4] = u[2] << 1; + u[5] = u[4] ^ b0m; + u[6] = u[3] << 1; + u[7] = u[6] ^ b0m; + + // Step 2: Multiply two elements in parallel in positions i, i+s + l = u[LSB3(a0)] ^ (u[LSB3(a0 >> 3)] << 3); + h = (u[LSB3(a0 >> 3)] >> 61); + + for(size_t i = (2 * s); i < w; i += (2 * s)) { + const size_t i2 = (i + s); + + g1 = u[LSB3(a0 >> i)]; + g2 = u[LSB3(a0 >> i2)]; + + l ^= (g1 << i) ^ (g2 << i2); + h ^= (g1 >> (w - i)) ^ (g2 >> (w - i2)); + } + + // Step 3: Multiply the last three bits. + for(size_t i = 61; i < 64; i++) { + uint64_t mask = (-((b0 >> i) & 1)); + l ^= ((a0 << i) & mask); + h ^= ((a0 >> (w - i)) & mask); + } + + c[0] = l; + c[1] = h; +} + +// c = a^2 +void gf2x_sqr_port(OUT dbl_pad_r_t *c, IN const pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i++) { + gf2x_mul_base_port(&c64[2 * i], &a64[i], &a64[i]); + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c new file mode 100644 index 0000000000..c321bf355f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_base_vpclmul.c @@ -0,0 +1,135 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#if defined(S2N_BIKE_R3_VPCLMUL) + +#include "gf2x_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +#define CLMUL(x, y, imm) _mm512_clmulepi64_epi128((x), (y), (imm)) + +_INLINE_ void +mul2_512(OUT __m512i *h, OUT __m512i *l, IN const __m512i a, IN const __m512i b) +{ + const __m512i mask_abq = SET_I64(6, 7, 4, 5, 2, 3, 0, 1); + const __m512i s1 = a ^ PERMX_I64(a, _MM_SHUFFLE(2, 3, 0, 1)); + const __m512i s2 = b ^ PERMX_I64(b, _MM_SHUFFLE(2, 3, 0, 1)); + + __m512i lq = CLMUL(a, b, 0x00); + __m512i hq = CLMUL(a, b, 0x11); + __m512i abq = lq ^ hq ^ CLMUL(s1, s2, 0x00); + abq = PERMXVAR_I64(mask_abq, abq); + *l = MXOR_I64(lq, 0xaa, lq, abq); + *h = MXOR_I64(hq, 0x55, hq, abq); +} + +// 8x8 Karatsuba multiplication +_INLINE_ void gf2x_mul8_512_int(OUT __m512i *zh, + OUT __m512i * zl, + IN const __m512i a, + IN const __m512i b) +{ + const __m512i mask0 = SET_I64(13, 12, 5, 4, 9, 8, 1, 0); + const __m512i mask1 = SET_I64(15, 14, 7, 6, 11, 10, 3, 2); + const __m512i mask2 = SET_I64(3, 2, 1, 0, 7, 6, 5, 4); + const __m512i mask3 = SET_I64(11, 10, 9, 8, 3, 2, 1, 0); + const __m512i mask4 = SET_I64(15, 14, 13, 12, 7, 6, 5, 4); + const __m512i mask_s1 = SET_I64(7, 6, 5, 4, 1, 0, 3, 2); + const __m512i mask_s2 = SET_I64(3, 2, 7, 6, 5, 4, 1, 0); + + __m512i xl, xh, xabl, xabh, xab, xab1, xab2; + __m512i yl, yh, yabl, yabh, yab; + __m512i t[4]; + + // Calculate: + // AX1^AX3|| AX2^AX3 || AX0^AX2 || AX0^AX1 + // BX1^BX3|| BX2^BX3 || BX0^BX2 || BX0^BX1 + // Where (AX1^AX3 || AX0^AX2) stands for (AX1 || AX0)^(AX3 || AX2) = AY0^AY1 + t[0] = PERMXVAR_I64(mask_s1, a) ^ PERMXVAR_I64(mask_s2, a); + t[1] = PERMXVAR_I64(mask_s1, b) ^ PERMXVAR_I64(mask_s2, b); + + // Calculate: + // Don't care || AX1^AX3^AX0^AX2 + // Don't care || BX1^BX3^BX0^BX2 + t[2] = t[0] ^ VALIGN(t[0], t[0], 4); + t[3] = t[1] ^ VALIGN(t[1], t[1], 4); + + mul2_512(&xh, &xl, a, b); + mul2_512(&xabh, &xabl, t[0], t[1]); + mul2_512(&yabh, &yabl, t[2], t[3]); + + xab = xl ^ xh ^ PERMX2VAR_I64(xabl, mask0, xabh); + yl = PERMX2VAR_I64(xl, 
mask3, xh); + yh = PERMX2VAR_I64(xl, mask4, xh); + xab1 = VALIGN(xab, xab, 6); + xab2 = VALIGN(xab, xab, 2); + yl = MXOR_I64(yl, 0x3c, yl, xab1); + yh = MXOR_I64(yh, 0x3c, yh, xab2); + + __m512i oxh = PERMX2VAR_I64(xabl, mask1, xabh); + __m512i oxl = VALIGN(oxh, oxh, 4); + yab = oxl ^ oxh ^ PERMX2VAR_I64(yabl, mask0, yabh); + yab = MXOR_I64(oxh, 0x3c, oxh, VALIGN(yab, yab, 2)); + yab ^= yl ^ yh; + + // Z0 (yl) + Z1 (yab) + Z2 (yh) + yab = PERMXVAR_I64(mask2, yab); + *zl = MXOR_I64(yl, 0xf0, yl, yab); + *zh = MXOR_I64(yh, 0x0f, yh, yab); +} + +// 1024x1024 bit multiplication performed by Karatsuba algorithm. +// Here, a and b are considered as having 16 digits of size 64 bits. +void gf2x_mul_base_vpclmul(OUT uint64_t *c, + IN const uint64_t *a, + IN const uint64_t *b) +{ + const __m512i a0 = LOAD(a); + const __m512i a1 = LOAD(&a[QWORDS_IN_ZMM]); + const __m512i b0 = LOAD(b); + const __m512i b1 = LOAD(&b[QWORDS_IN_ZMM]); + + __m512i hi[2], lo[2], mi[2]; + + gf2x_mul8_512_int(&lo[1], &lo[0], a0, b0); + gf2x_mul8_512_int(&hi[1], &hi[0], a1, b1); + gf2x_mul8_512_int(&mi[1], &mi[0], a0 ^ a1, b0 ^ b1); + + __m512i m = lo[1] ^ hi[0]; + + STORE(&c[0 * QWORDS_IN_ZMM], lo[0]); + STORE(&c[1 * QWORDS_IN_ZMM], mi[0] ^ lo[0] ^ m); + STORE(&c[2 * QWORDS_IN_ZMM], mi[1] ^ hi[1] ^ m); + STORE(&c[3 * QWORDS_IN_ZMM], hi[1]); +} + +void gf2x_sqr_vpclmul(OUT dbl_pad_r_t *c, IN const pad_r_t *a) +{ + __m512i va, vm, vr0, vr1; + + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + vm = SET_I64(7, 3, 6, 2, 5, 1, 4, 0); + + for(size_t i = 0; i < (R_ZMM * QWORDS_IN_ZMM); i += QWORDS_IN_ZMM) { + va = LOAD(&a64[i]); + va = PERMXVAR_I64(vm, va); + + vr0 = CLMUL(va, va, 0x00); + vr1 = CLMUL(va, va, 0x11); + + STORE(&c64[i * 2], vr0); + STORE(&c64[i * 2 + QWORDS_IN_ZMM], vr1); + } +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c new file mode 100644 index 0000000000..187042d44c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/gf2x_mul_portable.c @@ -0,0 +1,103 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. 
+ */ + +#include <assert.h> + +#include "cleanup.h" +#include "gf2x_internal.h" + +#define PORTABLE_INTERNAL +#include "x86_64_intrinsic.h" + +void karatzuba_add1_port(OUT uint64_t *alah, + OUT uint64_t *blbh, + IN const uint64_t *a, + IN const uint64_t *b, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T va0, va1, vb0, vb1; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + va0 = LOAD(&a[i]); + va1 = LOAD(&a[i + qwords_len]); + vb0 = LOAD(&b[i]); + vb1 = LOAD(&b[i + qwords_len]); + + STORE(&alah[i], va0 ^ va1); + STORE(&blbh[i], vb0 ^ vb1); + } +} + +void karatzuba_add2_port(OUT uint64_t *z, + IN const uint64_t *x, + IN const uint64_t *y, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vx, vy; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vx = LOAD(&x[i]); + vy = LOAD(&y[i]); + + STORE(&z[i], vx ^ vy); + } +} + +void karatzuba_add3_port(OUT uint64_t *c, + IN const uint64_t *mid, + IN const size_t qwords_len) +{ + assert(qwords_len % REG_QWORDS == 0); + + REG_T vr0, vr1, vr2, vr3, vt; + + uint64_t *c0 = c; + uint64_t *c1 = &c[qwords_len]; + uint64_t *c2 = &c[2 * qwords_len]; + uint64_t *c3 = &c[3 * qwords_len]; + + for(size_t i = 0; i < qwords_len; i += REG_QWORDS) { + vr0 = LOAD(&c0[i]); + vr1 = LOAD(&c1[i]); + vr2 = LOAD(&c2[i]); + vr3 = LOAD(&c3[i]); + vt = LOAD(&mid[i]); + + STORE(&c1[i], vt ^ vr0 ^ vr1); + STORE(&c2[i], vt ^ vr2 ^ vr3); + } +} + +// c = a mod (x^r - 1) +void gf2x_red_port(OUT pad_r_t *c, IN const dbl_pad_r_t *a) +{ + const uint64_t *a64 = (const uint64_t *)a; + uint64_t * c64 = (uint64_t *)c; + + for(size_t i = 0; i < R_QWORDS; i += REG_QWORDS) { + REG_T vt0 = LOAD(&a64[i]); + REG_T vt1 = LOAD(&a64[i + R_QWORDS]); + REG_T vt2 = LOAD(&a64[i + R_QWORDS - 1]); + + vt1 = SLLI_I64(vt1, LAST_R_QWORD_TRAIL); + vt2 = SRLI_I64(vt2, LAST_R_QWORD_LEAD); + + vt0 ^= (vt1 | vt2); + + STORE(&c64[i], vt0); + } + + c64[R_QWORDS - 1] &= LAST_R_QWORD_MASK; + + // Clean the secrets from the upper part of c + secure_clean((uint8_t *)&c64[R_QWORDS], + (R_PADDED_QWORDS - R_QWORDS) * sizeof(uint64_t)); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c new file mode 100644 index 0000000000..a76a31ef87 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.c @@ -0,0 +1,170 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <assert.h> + +#include "sampling.h" +#include "sampling_internal.h" + +// SIMD implementation of is_new function requires the size of wlist +// to be a multiple of the number of DWORDS in a SIMD register (REG_DWORDS). +// The function is used both for generating DV and T1 random numbers so we define +// two separate macros. 
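// [Editor's note: worked example with an assumed parameter value, not part of
// the upstream sources.] With AVX512_REG_DWORDS = 16, a weight of, say,
// DV = 71 would be padded to WLIST_SIZE_ADJUSTED_D = 16 * ceil(71/16) = 80,
// so the temporary wlist buffers can always be scanned in whole SIMD registers.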
+#define AVX512_REG_DWORDS (16) +#define WLIST_SIZE_ADJUSTED_D \ + (AVX512_REG_DWORDS * DIVIDE_AND_CEIL(DV, AVX512_REG_DWORDS)) +#define WLIST_SIZE_ADJUSTED_T \ + (AVX512_REG_DWORDS * DIVIDE_AND_CEIL(T1, AVX512_REG_DWORDS)) + +// BSR returns ceil(log2(val)) +_INLINE_ uint8_t bit_scan_reverse_vartime(IN uint64_t val) +{ + // index is always smaller than 64 + uint8_t index = 0; + + while(val != 0) { + val >>= 1; + index++; + } + + return index; +} + +_INLINE_ ret_t get_rand_mod_len(OUT uint32_t * rand_pos, + IN const uint32_t len, + IN OUT aes_ctr_prf_state_t *prf_state) +{ + const uint64_t mask = MASK(bit_scan_reverse_vartime(len)); + + do { + // Generate a 32 bits (pseudo) random value. + // This can be optimized to take only 16 bits. + POSIX_GUARD(aes_ctr_prf((uint8_t *)rand_pos, prf_state, sizeof(*rand_pos))); + + // Mask relevant bits only + (*rand_pos) &= mask; + + // Break if a number that is smaller than len is found + if((*rand_pos) < len) { + break; + } + + } while(1 == 1); + + return SUCCESS; +} + +_INLINE_ void make_odd_weight(IN OUT r_t *r) +{ + if(((r_bits_vector_weight(r) % 2) == 1)) { + // Already odd + return; + } + + r->raw[0] ^= 1; +} + +// Returns an array of r pseudorandom bits. +// No restrictions exist for the top or bottom bits. +// If the generation requires an odd number, then set must_be_odd=1. +// The function uses the provided prf context. +ret_t sample_uniform_r_bits_with_fixed_prf_context( + OUT r_t *r, + IN OUT aes_ctr_prf_state_t *prf_state, + IN const must_be_odd_t must_be_odd) +{ + // Generate random data + POSIX_GUARD(aes_ctr_prf(r->raw, prf_state, R_BYTES)); + + // Mask upper bits of the MSByte + r->raw[R_BYTES - 1] &= MASK(R_BITS + 8 - (R_BYTES * 8)); + + if(must_be_odd == MUST_BE_ODD) { + make_odd_weight(r); + } + + return SUCCESS; +} + +ret_t generate_indices_mod_z(OUT idx_t * out, + IN const size_t num_indices, + IN const size_t z, + IN OUT aes_ctr_prf_state_t *prf_state, + IN const sampling_ctx *ctx) +{ + size_t ctr = 0; + + // Generate num_indices unique (pseudo) random numbers modulo z + do { + POSIX_GUARD(get_rand_mod_len(&out[ctr], z, prf_state)); + ctr += ctx->is_new(out, ctr); + } while(ctr < num_indices); + + return SUCCESS; +} + +// Returns an array of r pseudorandom bits. +// No restrictions exist for the top or bottom bits. 
+// If the generation requires an odd number, then set must_be_odd = MUST_BE_ODD +ret_t sample_uniform_r_bits(OUT r_t *r, + IN const seed_t * seed, + IN const must_be_odd_t must_be_odd) +{ + // For the seedexpander + DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); + + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + + POSIX_GUARD(sample_uniform_r_bits_with_fixed_prf_context(r, &prf_state, must_be_odd)); + + return SUCCESS; +} + +ret_t generate_sparse_rep(OUT pad_r_t *r, + OUT idx_t *wlist, + IN OUT aes_ctr_prf_state_t *prf_state) +{ + + // Initialize the sampling context + sampling_ctx ctx; + sampling_ctx_init(&ctx); + + idx_t wlist_temp[WLIST_SIZE_ADJUSTED_D] = {0}; + + POSIX_GUARD(generate_indices_mod_z(wlist_temp, DV, R_BITS, prf_state, &ctx)); + + bike_memcpy(wlist, wlist_temp, DV * sizeof(idx_t)); + ctx.secure_set_bits(r, 0, wlist, DV); + + return SUCCESS; +} + +ret_t generate_error_vector(OUT pad_e_t *e, IN const seed_t *seed) +{ + DEFER_CLEANUP(aes_ctr_prf_state_t prf_state = {0}, aes_ctr_prf_state_cleanup); + + POSIX_GUARD(init_aes_ctr_prf_state(&prf_state, MAX_AES_INVOKATION, seed)); + + // Initialize the sampling context + sampling_ctx ctx; + sampling_ctx_init(&ctx); + + idx_t wlist[WLIST_SIZE_ADJUSTED_T] = {0}; + POSIX_GUARD(generate_indices_mod_z(wlist, T1, N_BITS, &prf_state, &ctx)); + + // (e0, e1) hold bits 0..R_BITS-1 and R_BITS..2*R_BITS-1 of the error, resp. + ctx.secure_set_bits(&e->val[0], 0, wlist, T1); + ctx.secure_set_bits(&e->val[1], R_BITS, wlist, T1); + + // Clean the padding of the elements + PE0_RAW(e)[R_BYTES - 1] &= LAST_R_BYTE_MASK; + PE1_RAW(e)[R_BYTES - 1] &= LAST_R_BYTE_MASK; + bike_memset(&PE0_RAW(e)[R_BYTES], 0, R_PADDED_BYTES - R_BYTES); + bike_memset(&PE1_RAW(e)[R_BYTES], 0, R_PADDED_BYTES - R_BYTES); + + return SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h new file mode 100644 index 0000000000..a9d50c9bc2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling.h @@ -0,0 +1,40 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include <stdlib.h> +#include "aes_ctr_prf.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_result.h" +#include "utilities.h" + +typedef enum +{ + NO_RESTRICTION = 0, + MUST_BE_ODD = 1 +} must_be_odd_t; + +_INLINE_ ret_t get_seeds(OUT seeds_t *seeds) { + if(s2n_result_is_ok(s2n_get_random_bytes(seeds->seed[0].raw, sizeof(seeds_t)))) { + return SUCCESS; + } else { + BIKE_ERROR(E_FAIL_TO_GET_SEED); + } +} + +// Returns an array of r pseudorandom bits. If an odd +// weight of r is required, set must_be_odd to MUST_BE_ODD. +ret_t sample_uniform_r_bits(OUT r_t *r, + IN const seed_t *seed, + IN must_be_odd_t must_be_odd); + +ret_t generate_sparse_rep(OUT pad_r_t *r, + OUT idx_t *wlist, + IN OUT aes_ctr_prf_state_t *prf_state); + +ret_t generate_error_vector(OUT pad_e_t *e, IN const seed_t *seed); diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c new file mode 100644 index 0000000000..c23be2e86e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx2.c @@ -0,0 +1,123 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#if defined(S2N_BIKE_R3_AVX2) + +#include <assert.h> + +#include "sampling_internal.h" + +#define AVX2_INTERNAL +#include "x86_64_intrinsic.h" + +// For improved performance, we process NUM_YMMS amount of data in parallel. +#define NUM_YMMS (4) +#define YMMS_QWORDS (QWORDS_IN_YMM * NUM_YMMS) + +void secure_set_bits_avx2(OUT pad_r_t * r, + IN const size_t first_pos, + IN const idx_t *wlist, + IN const size_t w_size) +{ + // The function assumes that the size of r is a multiple + // of the cumulative size of used YMM registers. + assert((sizeof(*r) / sizeof(uint64_t)) % YMMS_QWORDS == 0); + + // va vectors hold the bits of the output array "r" + // va_pos_qw vectors hold the qw position indices of "r" + // The algorithm works as follows: + // 1. Initialize va_pos_qw with starting positions of qw's of "r" + // va_pos_qw = (3, 2, 1, 0); + // 2. While the size of "r" is not exceeded: + // 3. For each w in wlist: + // 4. Compare the pos_qw of w with positions in va_pos_qw + // and for the position which is equal set the appropriate + // bit in va vector. + // 5. Set va_pos_qw to the next qw positions of "r" + __m256i va[NUM_YMMS], va_pos_qw[NUM_YMMS], va_mask; + __m256i w_pos_qw, w_pos_bit; + __m256i one, inc; + + uint64_t *r64 = (uint64_t *)r; + + one = SET1_I64(1); + inc = SET1_I64(QWORDS_IN_YMM); + + // 1. Initialize + va_pos_qw[0] = SET_I64(3, 2, 1, 0); + for(size_t i = 1; i < NUM_YMMS; i++) { + va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc); + } + + // va_pos_qw vectors hold qw positions 0 .. (NUM_YMMS * QWORDS_IN_YMM - 1) + // Therefore, we set the increment vector inc such that by adding it to + // va_pos_qw vectors, they hold the next YMM_QWORDS qw positions. + inc = SET1_I64(YMMS_QWORDS); + + for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += YMMS_QWORDS) { + for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) { + va[va_iter] = SET_ZERO; + } + + for(size_t w_iter = 0; w_iter < w_size; w_iter++) { + int32_t w = wlist[w_iter] - first_pos; + w_pos_qw = SET1_I64(w >> 6); + w_pos_bit = SLLI_I64(one, w & MASK(6)); + + // 4. Compare the positions in va_pos_qw with w_pos_qw + // and set the appropriate bit in va + for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) { + va_mask = CMPEQ_I64(va_pos_qw[va_iter], w_pos_qw); + va[va_iter] |= (va_mask & w_pos_bit); + } + } + + // 5. Set the va_pos_qw to the next qw positions of r + // and store the previously computed data in r + for(size_t va_iter = 0; va_iter < NUM_YMMS; va_iter++) { + STORE(&r64[i + (va_iter * QWORDS_IN_YMM)], va[va_iter]); + va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc); + } + } +} + +int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr) +{ + bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t); + + REG_T idx_ctr = SET1_I32(wlist[ctr]); + + for(size_t i = 0; i < ctr; i += REG_DWORDS) { + // Comparisons are done with SIMD instructions with each SIMD register + // containing REG_DWORDS elements. We compare registers element-wise: + // idx_ctr = {8 repetitions of wlist[ctr]}, with + // idx_cur = {8 consecutive elements from wlist}. + // In the last iteration we consider wlist elements only up to ctr. + + REG_T idx_cur = LOAD(&wlist[i]); + REG_T cmp_res = CMPEQ_I32(idx_ctr, idx_cur); + uint32_t check = MOVEMASK(cmp_res); + + // Handle the last iteration by appropriate masking. 
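    // [Editor's note: worked example, not part of the upstream sources.]
    // E.g. with REG_DWORDS = 8, if only ctr - i = 3 candidates remain, the
    // byte-granular MOVEMASK result is masked with
    // MASK(3 * sizeof(idx_t)) = MASK(12) = 0xfff, keeping exactly the bytes
    // that belong to the 3 valid 32-bit comparisons.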
+ if(ctr < (i + REG_DWORDS)) { + // MOVEMASK instruction in AVX2 compares corresponding bytes from + // two given vector registers and produces a 32-bit mask. On the other hand, + // we compare idx_t elements, not bytes, so we multiply by sizeof(idx_t). + check &= MASK((ctr - i) * sizeof(idx_t)); + } + + if(check != 0) { + return 0; + } + } + + return 1; +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c new file mode 100644 index 0000000000..6cab4cffea --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_avx512.c @@ -0,0 +1,123 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#if defined(S2N_BIKE_R3_AVX512) + +#include <assert.h> + +#include "sampling_internal.h" + +#define AVX512_INTERNAL +#include "x86_64_intrinsic.h" + +// For improved performance, we process NUM_ZMMS amount of data in parallel. +#define NUM_ZMMS (8) +#define ZMMS_QWORDS (QWORDS_IN_ZMM * NUM_ZMMS) + +void secure_set_bits_avx512(OUT pad_r_t * r, + IN const size_t first_pos, + IN const idx_t *wlist, + IN const size_t w_size) +{ + // The function assumes that the size of r is a multiple + // of the cumulative size of used ZMM registers. + assert((sizeof(*r) / sizeof(uint64_t)) % ZMMS_QWORDS == 0); + + // va vectors hold the bits of the output array "r" + // va_pos_qw vectors hold the qw position indices of "r" + // The algorithm works as follows: + // 1. Initialize va_pos_qw with starting positions of qw's of "r" + // va_pos_qw = (7, 6, 5, 4, 3, 2, 1, 0); + // 2. While the size of "r" is not exceeded: + // 3. For each w in wlist: + // 4. Compare the pos_qw of w with positions in va_pos_qw + // and for the position which is equal set the appropriate + // bit in va vector. + // 5. Set va_pos_qw to the next qw positions of "r" + __m512i va[NUM_ZMMS], va_pos_qw[NUM_ZMMS]; + __m512i w_pos_qw, w_pos_bit, one, inc; + __mmask8 va_mask; + + uint64_t *r64 = (uint64_t *)r; + + one = SET1_I64(1); + inc = SET1_I64(QWORDS_IN_ZMM); + + // 1. Initialize + va_pos_qw[0] = SET_I64(7, 6, 5, 4, 3, 2, 1, 0); + for(size_t i = 1; i < NUM_ZMMS; i++) { + va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc); + } + + // va_pos_qw vectors hold qw positions 0 .. (NUM_ZMMS * QWORDS_IN_ZMM - 1) + // Therefore, we set the increment vector inc such that by adding it to + // va_pos_qw vectors they hold the next ZMMS_QWORDS qw positions. + inc = SET1_I64(ZMMS_QWORDS); + + for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += ZMMS_QWORDS) { + for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) { + va[va_iter] = SET_ZERO; + } + + for(size_t w_iter = 0; w_iter < w_size; w_iter++) { + int32_t w = wlist[w_iter] - first_pos; + w_pos_qw = SET1_I64(w >> 6); +#if (defined(__GNUC__) && ((__GNUC__ == 6) || (__GNUC__ == 5)) && !defined(__clang__)) || (defined(__clang__) && __clang_major__ == 3 && __clang_minor__ == 9) + // Workaround for gcc-6, gcc-5, and clang < 3.9, which do not allowing the second + // argument of SLLI to be non-immediate value. + __m512i temp = SET1_I64(w & MASK(6)); + w_pos_bit = SLLV_I64(one, temp); +#else + w_pos_bit = SLLI_I64(one, w & MASK(6)); +#endif + + // 4. 
Compare the positions in va_pos_qw with w_pos_qw + // and set the appropriate bit in va + for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) { + va_mask = CMPMEQ_I64(va_pos_qw[va_iter], w_pos_qw); + va[va_iter] = MOR_I64(va[va_iter], va_mask, va[va_iter], w_pos_bit); + } + } + + // 5. Set the va_pos_qw to the next qw positions of r + // and store the previously computed data in r + for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) { + STORE(&r64[i + (va_iter * QWORDS_IN_ZMM)], va[va_iter]); + va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc); + } + } +} + +int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr) +{ + bike_static_assert((sizeof(idx_t) == sizeof(uint32_t)), idx_t_is_not_uint32_t); + + REG_T idx_ctr = SET1_I32(wlist[ctr]); + + for(size_t i = 0; i < ctr; i += REG_DWORDS) { + // Comparisons are done with SIMD instructions with each SIMD register + // containing REG_DWORDS elements. We compare registers element-wise: + // idx_ctr = {8 repetitions of wlist[ctr]}, with + // idx_cur = {8 consecutive elements from wlist}. + // In the last iteration we consider wlist elements only up to ctr. + + REG_T idx_cur = LOAD(&wlist[i]); + + uint16_t mask = (ctr < (i + REG_DWORDS)) ? MASK(ctr - i) : 0xffff; + uint16_t check = MCMPMEQ_I32(mask, idx_ctr, idx_cur); + + if(check != 0) { + return 0; + } + } + + return 1; +} + +#endif + +typedef int dummy_typedef_to_avoid_empty_translation_unit_warning; diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h new file mode 100644 index 0000000000..3fd68354f2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_internal.h @@ -0,0 +1,66 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "pq-crypto/s2n_pq.h" +#include "defs.h" +#include "types.h" + +void secure_set_bits_port(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); + +// Compares wlist[ctr] to w[i] for all i < ctr. +// Returns 0 if wlist[ctr] is contained in wlist, returns 1 otherwise. 
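// [Editor's note: illustrative example, not part of the upstream sources.]
// E.g. for wlist = {5, 9, 5} and ctr = 2, is_new returns 0 because wlist[2]
// repeats an earlier index, so generate_indices_mod_z keeps ctr unchanged and
// overwrites the slot with a fresh draw; for wlist = {5, 9, 7} it returns 1
// and the counter advances.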
+int is_new_port(IN const idx_t *wlist, IN const size_t ctr); + +#if defined(S2N_BIKE_R3_AVX2) +void secure_set_bits_avx2(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); + +int is_new_avx2(IN const idx_t *wlist, IN const size_t ctr); +#endif + +#if defined(S2N_BIKE_R3_AVX512) +void secure_set_bits_avx512(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); +int is_new_avx512(IN const idx_t *wlist, IN const size_t ctr); +#endif + +typedef struct sampling_ctx_st { + void (*secure_set_bits)(OUT pad_r_t *r, + IN size_t first_pos, + IN const idx_t *wlist, + IN size_t w_size); + int (*is_new)(IN const idx_t *wlist, IN const size_t ctr); +} sampling_ctx; + +_INLINE_ void sampling_ctx_init(sampling_ctx *ctx) +{ +#if defined(S2N_BIKE_R3_AVX512) + if(s2n_bike_r3_is_avx512_enabled()) { + ctx->secure_set_bits = secure_set_bits_avx512; + ctx->is_new = is_new_avx512; + } else +#endif +#if defined(S2N_BIKE_R3_AVX2) + if(s2n_bike_r3_is_avx2_enabled()) { + ctx->secure_set_bits = secure_set_bits_avx2; + ctx->is_new = is_new_avx2; + } else +#endif + { + ctx->secure_set_bits = secure_set_bits_port; + ctx->is_new = is_new_port; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c new file mode 100644 index 0000000000..b670730f0a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sampling_portable.c @@ -0,0 +1,60 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <assert.h> + +#include "sampling_internal.h" +#include "utilities.h" + +#define MAX_WLIST_SIZE (T1 > DV ? T1 : DV) + +void secure_set_bits_port(OUT pad_r_t * r, + IN const size_t first_pos, + IN const idx_t *wlist, + IN const size_t w_size) +{ + assert(w_size <= MAX_WLIST_SIZE); + + // Ideally we would like to cast r.val but it is not guaranteed to be aligned + // as the entire pad_r_t structure. Thus, we assert that the position of val + // is at the beginning of r. + bike_static_assert(offsetof(pad_r_t, val) == 0, val_wrong_pos_in_pad_r_t); + uint64_t *a64 = (uint64_t *)r; + uint64_t val, mask; + + // The size of wlist can be either DV or T. So, we set it to max(D, T) + size_t pos_qw[MAX_WLIST_SIZE]; + size_t pos_bit[MAX_WLIST_SIZE]; + + // Identify the QW position of every value, and the bit position inside this QW. + for(size_t i = 0; i < w_size; i++) { + int32_t w = wlist[i] - first_pos; + pos_qw[i] = w >> 6; + pos_bit[i] = BIT(w & MASK(6)); + } + + // Fill each QW in constant time + for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i++) { + val = 0; + for(size_t j = 0; j < w_size; j++) { + mask = (-1ULL) + (!secure_cmp32(pos_qw[j], i)); + val |= (pos_bit[j] & mask); + } + a64[i] = val; + } +} + +int is_new_port(IN const idx_t *wlist, IN const size_t ctr) +{ + for(size_t i = 0; i < ctr; i++) { + if(wlist[i] == wlist[ctr]) { + return 0; + } + } + + return 1; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h new file mode 100644 index 0000000000..1857d6e638 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/sha.h @@ -0,0 +1,43 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include "cleanup.h" +#include "error.h" +#include "types.h" +#include "utilities.h" + +#include <openssl/sha.h> + +#define SHA384_DGST_BYTES 48ULL +#define SHA384_DGST_QWORDS (SHA384_DGST_BYTES / 8) + +#define SHA512_DGST_BYTES 64ULL +#define SHA512_DGST_QWORDS (SHA512_DGST_BYTES / 8) + +typedef struct sha384_dgst_s { + union { + uint8_t raw[SHA384_DGST_BYTES]; + uint64_t qw[SHA384_DGST_QWORDS]; + } u; +} sha384_dgst_t; +bike_static_assert(sizeof(sha384_dgst_t) == SHA384_DGST_BYTES, sha384_dgst_size); + +typedef sha384_dgst_t sha_dgst_t; +CLEANUP_FUNC(sha_dgst, sha_dgst_t) + +_INLINE_ ret_t sha(OUT sha_dgst_t * dgst, + IN const uint32_t byte_len, + IN const uint8_t *msg) +{ + if(SHA384(msg, byte_len, dgst->u.raw) != NULL) { + return SUCCESS; + } + + return FAIL; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h new file mode 100644 index 0000000000..436a584f3e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/types.h @@ -0,0 +1,120 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include "bike_defs.h" +#include "error.h" + +typedef struct uint128_s { + union { + uint8_t bytes[16]; // NOLINT + uint32_t dw[4]; // NOLINT + uint64_t qw[2]; // NOLINT + } u; +} uint128_t; + +// Make sure no compiler optimizations. +#pragma pack(push, 1) + +typedef struct seed_s { + uint8_t raw[SEED_BYTES]; +} seed_t; + +typedef struct seeds_s { + seed_t seed[NUM_OF_SEEDS]; +} seeds_t; + +typedef struct r_s { + uint8_t raw[R_BYTES]; +} r_t; + +typedef struct m_s { + uint8_t raw[M_BYTES]; +} m_t; + +typedef struct e_s { + r_t val[N0]; +} e_t; + +#define E0_RAW(e) ((e)->val[0].raw) +#define E1_RAW(e) ((e)->val[1].raw) + +typedef struct ct_s { + r_t c0; + m_t c1; +} ct_t; + +typedef r_t pk_t; + +typedef struct ss_st { + uint8_t raw[SS_BYTES]; +} ss_t; + +typedef uint32_t idx_t; + +typedef struct compressed_idx_d_s { + idx_t val[DV]; +} compressed_idx_d_t; + +typedef compressed_idx_d_t compressed_idx_d_ar_t[N0]; + +// The secret key holds both representations, to avoid +// the compression in Decaps. 
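// [Editor's note: illustrative example, not part of the upstream sources.]
// The two representations of the same sparse polynomial are the index list of
// its non-zero coefficients (wlist) and the dense bit-array (bin). E.g. a toy
// polynomial x + x^5 would be stored as wlist = {1, 5} and a first bin byte
// of 0x22 (bits 1 and 5 set, LSB-first).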
+typedef struct sk_s { + compressed_idx_d_ar_t wlist; + r_t bin[N0]; + pk_t pk; + m_t sigma; +} sk_t; + +typedef ALIGN(sizeof(idx_t)) sk_t aligned_sk_t; + +// Pad r to the next Block +typedef struct pad_r_s { + r_t val; + uint8_t pad[R_PADDED_BYTES - sizeof(r_t)]; +} ALIGN(ALIGN_BYTES) pad_r_t; + +// Double padded r, required for multiplication and squaring +typedef struct dbl_pad_r_s { + uint8_t raw[2 * R_PADDED_BYTES]; +} ALIGN(ALIGN_BYTES) dbl_pad_r_t; + +typedef struct pad_e_s { + pad_r_t val[N0]; +} ALIGN(ALIGN_BYTES) pad_e_t; + +#define PE0_RAW(e) ((e)->val[0].val.raw) +#define PE1_RAW(e) ((e)->val[1].val.raw) + +typedef struct func_k_s { + m_t m; + r_t c0; + m_t c1; +} func_k_t; + +// For a faster rotate we triplicate the syndrome (into 3 copies) +typedef struct syndrome_s { + uint64_t qw[3 * R_QWORDS]; +} ALIGN(ALIGN_BYTES) syndrome_t; + +typedef struct upc_slice_s { + union { + pad_r_t r; + uint64_t qw[sizeof(pad_r_t) / sizeof(uint64_t)]; + } ALIGN(ALIGN_BYTES) u; +} ALIGN(ALIGN_BYTES) upc_slice_t; + +typedef struct upc_s { + upc_slice_t slice[SLICES]; +} upc_t; + +#pragma pack(pop) diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c new file mode 100644 index 0000000000..0c6ad3ea0f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.c @@ -0,0 +1,24 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#include <inttypes.h> + +#include "utilities.h" + +#define BITS_IN_QWORD 64ULL +#define BITS_IN_BYTE 8ULL + +uint64_t r_bits_vector_weight(IN const r_t *in) +{ + uint64_t acc = 0; + for(size_t i = 0; i < (R_BYTES - 1); i++) { + acc += __builtin_popcount(in->raw[i]); + } + + acc += __builtin_popcount(in->raw[R_BYTES - 1] & LAST_R_BYTE_MASK); + return acc; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h new file mode 100644 index 0000000000..f544990a1a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/utilities.h @@ -0,0 +1,139 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +#pragma once + +// For memset +#include <string.h> + +#include "types.h" + +uint64_t r_bits_vector_weight(IN const r_t *in); + +// "VALUE_BARRIER returns |a|, but prevents GCC and Clang from reasoning about +// the returned value. This is used to mitigate compilers undoing constant-time +// code, until we can express our requirements directly in the language. +// Note the compiler is aware that |VALUE_BARRIER| has no side effects and +// always has the same output for a given input. This allows it to eliminate +// dead code, move computations across loops, and vectorize." 
+// See: +// https://github.com/google/boringssl/commit/92b7c89e6e8ba82924b57153bea68241cc45f658 +#if(defined(__GNUC__) || defined(__clang__)) +# define VALUE_BARRIER(name, type) \ + _INLINE_ type name##_barrier(type a) \ + { \ + __asm__("" : "+r"(a) : /* no inputs */); \ + return a; \ + } +#else +# define VALUE_BARRIER(name, type) \ + _INLINE_ type name##_barrier(type a) { return a; } +#endif + +VALUE_BARRIER(u8, uint8_t) +VALUE_BARRIER(u32, uint32_t) +VALUE_BARRIER(u64, uint64_t) + +// Comparing value in a constant time manner +_INLINE_ uint32_t secure_cmp(IN const uint8_t *a, + IN const uint8_t *b, + IN const uint32_t size) +{ + volatile uint8_t res = 0; + + for(uint32_t i = 0; i < size; ++i) { + res |= (a[i] ^ b[i]); + } + + return (0 == res); +} + +// Return 1 if the arguments are equal to each other. Return 0 otherwise. +_INLINE_ uint32_t secure_cmp32(IN const uint32_t v1, IN const uint32_t v2) +{ +#if defined(__aarch64__) + uint32_t res; + __asm__ __volatile__("cmp %w[V1], %w[V2]; \n " + "cset %w[RES], EQ; \n" + : [RES] "=r"(res) + : [V1] "r"(v1), [V2] "r"(v2) + : "cc" /*The condition code flag*/); + return res; +#elif defined(__x86_64__) || defined(__i386__) + uint32_t res; + __asm__ __volatile__("xor %%edx, %%edx; \n" + "cmp %1, %2; \n " + "sete %%dl; \n" + "mov %%edx, %0; \n" + : "=r"(res) + : "r"(v1), "r"(v2) + : "rdx"); + return res; +#else + // Insecure comparison: The main purpose of secure_cmp32 is to avoid + // branches to prevent potential side channel leaks. To do that, + // we normally leverage some special CPU instructions such as "sete" + // (for __x86_64__) and "cset" (for __aarch64__). When dealing with general + // CPU architectures, the interpretation of the line below is left for the + // compiler. It could lead to an "insecure" branch. This case needs to be + // checked individually on such platforms + // (e.g., by checking the compiler-generated assembly). + return (v1 == v2 ? 1 : 0); +#endif +} + +// Return 0 if v1 < v2, (-1) otherwise +_INLINE_ uint32_t secure_l32_mask(IN const uint32_t v1, IN const uint32_t v2) +{ +#if defined(__aarch64__) + uint32_t res; + __asm__ __volatile__("cmp %w[V2], %w[V1]; \n " + "cset %w[RES], HI; \n" + : [RES] "=r"(res) + : [V1] "r"(v1), [V2] "r"(v2) + : "cc" /*The condition code flag*/); + return (res - 1); +#elif defined(__x86_64__) || defined(__i386__) + uint32_t res; + __asm__ __volatile__("xor %%edx, %%edx; \n" + "cmp %1, %2; \n " + "setl %%dl; \n" + "dec %%edx; \n" + "mov %%edx, %0; \n" + + : "=r"(res) + : "r"(v2), "r"(v1) + : "rdx"); + + return res; +#else + // If v1 >= v2 then the subtraction result is 0^32||(v1-v2). + // else it is 1^32||(v2-v1+1). Subsequently, negating the upper + // 32 bits gives 0 if v1 < v2 and otherwise (-1). 
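  // [Editor's note: worked example, not part of the upstream sources.]
  // E.g. v1 = 3, v2 = 5: the 64-bit subtraction gives 0xfffffffffffffffe,
  // its upper 32 bits are 0xffffffff, and negation yields 0 (v1 < v2).
  // For v1 = 5, v2 = 3 the upper 32 bits are 0 and negation yields
  // 0xffffffff, i.e. (-1).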
+ return ~((uint32_t)(((uint64_t)v1 - (uint64_t)v2) >> 32)); +#endif +} + +// bike_memcpy avoids the undefined behaviour of memcpy when byte_len=0 +_INLINE_ void *bike_memcpy(void *dst, const void *src, size_t byte_len) +{ + if(byte_len == 0) { + return dst; + } + + return memcpy(dst, src, byte_len); +} + +// bike_memset avoids the undefined behaviour of memset when byte_len=0 +_INLINE_ void *bike_memset(void *dst, const int ch, size_t byte_len) +{ + if(byte_len == 0) { + return dst; + } + + return memset(dst, ch, byte_len); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h new file mode 100644 index 0000000000..b5c1e989bd --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/bike_r3/x86_64_intrinsic.h @@ -0,0 +1,132 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0" + * + * Written by Nir Drucker, Shay Gueron and Dusan Kostic, + * AWS Cryptographic Algorithms Group. + */ + +// This file contains definitions of macros for SIMD intrinsic functions, used +// throughout the code package. Where necessary, we add a suffix to a macro, +// and denote the type of the elements (operateds). For example, +// - I16 denotes 16-bit wide integers, +// - U64 denotes 64-bit wide unsigned integers. + +#pragma once + +#if defined(S2N_BIKE_R3_AVX2) || defined(S2N_BIKE_R3_AVX512) +# include <immintrin.h> +#endif + +// clang 3.9 doesn't recognize this macro +#if !defined(_MM_CMPINT_EQ) +# define _MM_CMPINT_EQ (0) +#endif + +// For functions in gf2x_mul.c we use exactly the same code for +// PORTABLE, AVX2, AVX512 implementations. Based on the implementation, +// we define macros for the different data types (uint64_t, __m256i, __m512i), +// and all the required operations (LOAD, STORE, >>, <<) on these types. +#if defined(AVX2_INTERNAL) + +# define REG_T __m256i + +# define LOAD(mem) _mm256_loadu_si256((const void *)(mem)) +# define STORE(mem, reg) _mm256_storeu_si256((void *)(mem), (reg)) + +# define SLLI_I64(a, imm) _mm256_slli_epi64(a, imm) +# define SRLI_I64(a, imm) _mm256_srli_epi64(a, imm) + +#elif defined(AVX512_INTERNAL) + +# define REG_T __m512i + +# define LOAD(mem) _mm512_loadu_si512((mem)) +# define STORE(mem, reg) _mm512_storeu_si512((mem), (reg)) + +# define SLLI_I64(a, imm) _mm512_slli_epi64(a, imm) +# define SRLI_I64(a, imm) _mm512_srli_epi64(a, imm) + +#elif defined(PORTABLE_INTERNAL) + +# define REG_T uint64_t + +# define LOAD(mem) (mem)[0] +# define STORE(mem, val) (mem)[0] = val + +# define SLLI_I64(a, imm) ((a) << (imm)) +# define SRLI_I64(a, imm) ((a) >> (imm)) + +#endif + +// NOLINT is used to avoid the sizeof(T)/sizeof(T) warning when REG_T is defined +// to be uint64_t +#define REG_QWORDS (sizeof(REG_T) / sizeof(uint64_t)) // NOLINT +#define REG_DWORDS (sizeof(REG_T) / sizeof(uint32_t)) // NOLINT + +// The rest of the SIMD macros that are +// required for AVX2 and AVX512 implementation. +#if defined(AVX2_INTERNAL) + +# define SET_I8(...) _mm256_set_epi8(__VA_ARGS__) +# define SET_I32(...) _mm256_set_epi32(__VA_ARGS__) +# define SET_I64(...) 
_mm256_set_epi64x(__VA_ARGS__) +# define SET1_I8(a) _mm256_set1_epi8(a) +# define SET1_I16(a) _mm256_set1_epi16(a) +# define SET1_I32(a) _mm256_set1_epi32(a) +# define SET1_I64(a) _mm256_set1_epi64x(a) +# define SET_ZERO _mm256_setzero_si256() + +# define ADD_I8(a, b) _mm256_add_epi8(a, b) +# define SUB_I8(a, b) _mm256_sub_epi8(a, b) +# define ADD_I16(a, b) _mm256_add_epi16(a, b) +# define SUB_I16(a, b) _mm256_sub_epi16(a, b) +# define ADD_I64(a, b) _mm256_add_epi64(a, b) +# define SRLI_I16(a, imm) _mm256_srli_epi16(a, imm) +# define SLLI_I32(a, imm) _mm256_slli_epi32(a, imm) +# define SLLV_I32(a, b) _mm256_sllv_epi32(a, b) + +# define CMPGT_I16(a, b) _mm256_cmpgt_epi16(a, b) +# define CMPEQ_I16(a, b) _mm256_cmpeq_epi16(a, b) +# define CMPEQ_I32(a, b) _mm256_cmpeq_epi32(a, b) +# define CMPEQ_I64(a, b) _mm256_cmpeq_epi64(a, b) + +# define SHUF_I8(a, b) _mm256_shuffle_epi8(a, b) +# define BLENDV_I8(a, b, mask) _mm256_blendv_epi8(a, b, mask) +# define PERMVAR_I32(a, idx) _mm256_permutevar8x32_epi32(a, idx) +# define PERM_I64(a, imm) _mm256_permute4x64_epi64(a, imm) + +# define MOVEMASK(a) _mm256_movemask_epi8(a) + +#elif defined(AVX512_INTERNAL) + +# define MSTORE(mem, mask, reg) _mm512_mask_store_epi64((mem), (mask), (reg)) + +# define SET1_I8(a) _mm512_set1_epi8(a) +# define SET1_I32(a) _mm512_set1_epi32(a) +# define SET1_I64(a) _mm512_set1_epi64(a) +# define SET1MZ_I8(mask, a) _mm512_maskz_set1_epi8(mask, a) +# define SET1_I16(a) _mm512_set1_epi16(a) +# define SET_I64(...) _mm512_set_epi64(__VA_ARGS__) +# define SET_ZERO _mm512_setzero_si512() + +# define ADD_I16(a, b) _mm512_add_epi16(a, b) +# define ADD_I64(a, b) _mm512_add_epi64(a, b) +# define MSUB_I16(src, k, a, b) _mm512_mask_sub_epi16(src, k, a, b) +# define SRLI_I16(a, imm) _mm512_srli_epi16(a, imm) +# define SRLV_I64(a, cnt) _mm512_srlv_epi64(a, cnt) +# define SLLV_I64(a, cnt) _mm512_sllv_epi64(a, cnt) +# define MOR_I64(src, mask, a, b) _mm512_mask_or_epi64(src, mask, a, b) +# define MXOR_I64(src, mask, a, b) _mm512_mask_xor_epi64(src, mask, a, b) +# define VALIGN(a, b, count) _mm512_alignr_epi64(a, b, count) + +# define CMPM_U8(a, b, cmp_op) _mm512_cmp_epu8_mask(a, b, cmp_op) +# define CMPM_U16(a, b, cmp_op) _mm512_cmp_epu16_mask(a, b, cmp_op) +# define CMPMEQ_I64(a, b) _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ) +# define MCMPMEQ_I32(mask, a, b) \ + _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ) + +# define PERMX_I64(a, imm) _mm512_permutex_epi64(a, imm) +# define PERMX2VAR_I64(a, idx, b) _mm512_permutex2var_epi64(a, idx, b) +# define PERMXVAR_I64(idx, a) _mm512_permutexvar_epi64(idx, a) + +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c index c37548326d..4c520b693f 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/indcpa.c @@ -188,7 +188,7 @@ int PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c index 9de3c1daef..5b4c088b11 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c +++ 
b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/kyber_90s_r2_kem.c @@ -22,14 +22,14 @@ * Returns 0 (success) **************************************************/ int kyber_512_90s_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - GUARD_AS_POSIX(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ return 0; } @@ -46,11 +46,11 @@ int kyber_512_90s_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { * Returns 0 (success) **************************************************/ int kyber_512_90s_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t buf[2 * KYBER_SYMBYTES]; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ @@ -78,7 +78,7 @@ int kyber_512_90s_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) * On failure, ss will contain a pseudo-random value. 
**************************************************/ int kyber_512_90s_r2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; uint8_t fail; uint8_t cmp[KYBER_CIPHERTEXTBYTES]; diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h index 720bee975a..66fc5a9484 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_90s_r2/ntt.h @@ -6,8 +6,8 @@ extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128]; extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t *poly); +void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t poly[256]); +void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t poly[256]); void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c index 233b5d8515..1b76bb9b0c 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/indcpa.c @@ -188,7 +188,7 @@ int PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c index 9871084bb4..140ec352d4 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/kyber_r2_kem.c @@ -22,14 +22,14 @@ * Returns 0 (success) **************************************************/ int kyber_512_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; PQCLEAN_KYBER512_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - GUARD_AS_POSIX(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES)); /* Value z for pseudo-random output on reject */ return 0; } @@ -46,11 +46,11 @@ int kyber_512_r2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { * Returns 0 (success) **************************************************/ int kyber_512_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t buf[2 * KYBER_SYMBYTES]; - GUARD_AS_POSIX(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, KYBER_SYMBYTES)); hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget 
countermeasure for coins + contributory KEM */ @@ -78,7 +78,7 @@ int kyber_512_r2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { * On failure, ss will contain a pseudo-random value. **************************************************/ int kyber_512_r2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); size_t i; uint8_t fail; uint8_t cmp[KYBER_CIPHERTEXTBYTES]; diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h index 13e976f7d0..7885e7cdc6 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r2/ntt.h @@ -6,8 +6,8 @@ extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128]; extern const int16_t PQCLEAN_KYBER512_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER512_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER512_CLEAN_invntt(int16_t *poly); +void PQCLEAN_KYBER512_CLEAN_ntt(int16_t poly[256]); +void PQCLEAN_KYBER512_CLEAN_invntt(int16_t poly[256]); void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c new file mode 100644 index 0000000000..349442f65c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c @@ -0,0 +1,1284 @@ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. 
+http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +// extra headers are removed: smmintrin.h, wmmintrin.h and emmintrin.h + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> +#include "KeccakP-align_avx2.h" +#include "KeccakP-1600-times4-SnP_avx2.h" +#include "KeccakP-SIMD256-config_avx2.h" + +#include "KeccakP-brg_endian_avx2.h" +#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) +#error Expecting a little-endian platform +#endif + +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; +typedef __m128i V128; +typedef __m256i V256; + +#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex) + +#if defined(KeccakP1600times4_useAVX2) + #define ANDnu256(a, b) _mm256_andnot_si256(a, b) + // correcting cast-align error + // old version: #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) + #define CONST256(a) _mm256_load_si256((const void *)&(a)) + #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a)) + #define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) + // correcting cast-align error + // old version: #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) + #define LOAD256u(a) _mm256_loadu_si256((const void *)&(a)) + #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) + #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) + #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) + #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) +static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; +static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; + #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) + // correcting cast-align error + // old version: #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) + #define STORE256u(a, b) _mm256_storeu_si256((void *)&(a), b) + #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v) + #define XOR256(a, b) _mm256_xor_si256(a, b) + #define XOReq256(a, b) a = _mm256_xor_si256(a, b) + #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) + #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) + #define PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) + #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) + + #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \ + lanesH01 = UNPACKH( lanes0, lanes1 ), \ + lanesL23 = UNPACKL( lanes2, lanes3 ), \ + lanesH23 = UNPACKH( lanes2, lanes3 ), \ + lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \ + lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \ + lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \ + lanes3 = PERM128( lanesH01, lanesH23, 0x31 ) + + #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \ + lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \ + lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \ + lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \ + lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \ + lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \ + lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \ + lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F ) + +#endif + +#define SnP_laneLengthInBytes 8 + +void KeccakP1600times4_InitializeAll(void *states) +{ + memset(states, 0, KeccakP1600times4_statesSizeInBytes); +} + +void 
KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + const unsigned char *curData = data; + UINT64 *statesAsLanes = (UINT64 *)states; + + if ((sizeLeft > 0) && (offsetInLane != 0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + UINT64 lane = 0; + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; + sizeLeft -= bytesInLane; + lanePosition++; + curData += bytesInLane; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align error + // old version: UINT64 lane = *((const UINT64*)curData); + UINT64 lane = *((const UINT64*)(const void *)curData); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curData += SnP_laneLengthInBytes; + } + + if (sizeLeft > 0) { + UINT64 lane = 0; + memcpy(&lane, curData, sizeLeft); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; + } +} + +void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) +{ + V256 *stateAsLanes = (V256 *)states; + unsigned int i; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes); + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + + #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + + #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ + XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ + XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ + XOReq256( stateAsLanes[argIndex+3], lanes3 ) + + if ( laneCount >= 16 ) { + Xor_In4( 0 ); + Xor_In4( 4 ); + Xor_In4( 8 ); + Xor_In4( 12 ); + if ( laneCount >= 20 ) { + Xor_In4( 16 ); + for(i=20; i<laneCount; i++) + Xor_In( i ); + } + else { + for(i=16; i<laneCount; i++) + Xor_In( i ); + } + } + else { + for(i=0; i<laneCount; i++) + Xor_In( i ); + } + #undef Xor_In + #undef Xor_In4 +} + +void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + const unsigned char *curData = data; + UINT64 *statesAsLanes = (UINT64 *)states; + 
+ if ((sizeLeft > 0) && (offsetInLane != 0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane); + sizeLeft -= bytesInLane; + lanePosition++; + curData += bytesInLane; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align error + // old version: UINT64 lane = *((const UINT64*)curData); + UINT64 lane = *((const UINT64*)(const void*)curData); + statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curData += SnP_laneLengthInBytes; + } + + if (sizeLeft > 0) { + memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft); + } +} + +void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) +{ + V256 *stateAsLanes = (V256 *)states; + unsigned int i; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffset*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffset*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffset*3*SnP_laneLengthInBytes); + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + + #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + + #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + STORE256( stateAsLanes[argIndex+0], lanes0 ),\ + STORE256( stateAsLanes[argIndex+1], lanes1 ),\ + STORE256( stateAsLanes[argIndex+2], lanes2 ),\ + STORE256( stateAsLanes[argIndex+3], lanes3 ) + + if ( laneCount >= 16 ) { + OverWr4( 0 ); + OverWr4( 4 ); + OverWr4( 8 ); + OverWr4( 12 ); + if ( laneCount >= 20 ) { + OverWr4( 16 ); + for(i=20; i<laneCount; i++) + OverWr( i ); + } + else { + for(i=16; i<laneCount; i++) + OverWr( i ); + } + } + else { + for(i=0; i<laneCount; i++) + OverWr( i ); + } + #undef OverWr + #undef OverWr4 +} + +void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount) +{ + unsigned int sizeLeft = byteCount; + unsigned int lanePosition = 0; + UINT64 *statesAsLanes = (UINT64 *)states; + + while(sizeLeft >= SnP_laneLengthInBytes) { + statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + } + + if (sizeLeft > 0) { + memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); + } +} + +void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + unsigned char 
*curData = data; + const UINT64 *statesAsLanes = (const UINT64 *)states; + + if ((sizeLeft > 0) && (offsetInLane != 0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + // correcting cast-qual error + // old version: memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); + memcpy( curData, ((const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); + sizeLeft -= bytesInLane; + lanePosition++; + curData += bytesInLane; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align error + // old version: *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + *(UINT64*)(void*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curData += SnP_laneLengthInBytes; + } + + if (sizeLeft > 0) { + memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft); + } +} + +void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset) +{ + // correcting cast-align errors + // old version: UINT64 *curData0 = (UINT64 *)data; + UINT64 *curData0 = (void *)data; + // old version: UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes); + UINT64 *curData1 = (void *)(data+laneOffset*1*SnP_laneLengthInBytes); + // old version: UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); + UINT64 *curData2 = (void *)(data+laneOffset*2*SnP_laneLengthInBytes); + // old version: UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); + UINT64 *curData3 = (void *)(data+laneOffset*3*SnP_laneLengthInBytes); + + const V256 *stateAsLanes = (const V256 *)states; + const UINT64 *stateAsLanes64 = (const UINT64*)states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + unsigned int i; + + #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \ + curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \ + curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \ + curData3[argIndex] = stateAsLanes64[4*(argIndex)+3] + + #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \ + lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \ + lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \ + lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \ + UNINTLEAVE(), \ + STORE256u( curData0[argIndex], lanes0 ), \ + STORE256u( curData1[argIndex], lanes1 ), \ + STORE256u( curData2[argIndex], lanes2 ), \ + STORE256u( curData3[argIndex], lanes3 ) + + if ( laneCount >= 16 ) { + Extr4( 0 ); + Extr4( 4 ); + Extr4( 8 ); + Extr4( 12 ); + if ( laneCount >= 20 ) { + Extr4( 16 ); + for(i=20; i<laneCount; i++) + Extr( i ); + } + else { + for(i=16; i<laneCount; i++) + Extr( i ); + } + } + else { + for(i=0; i<laneCount; i++) + Extr( i ); + } + #undef Extr + #undef Extr4 +} + +void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length) +{ + unsigned int sizeLeft = length; + unsigned int lanePosition = offset/SnP_laneLengthInBytes; + unsigned int offsetInLane = offset%SnP_laneLengthInBytes; + const unsigned char *curInput = input; + unsigned char *curOutput = output; + const UINT64 *statesAsLanes = (const UINT64 *)states; + + if ((sizeLeft > 0) && (offsetInLane != 
0)) { + unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; + UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane); + if (bytesInLane > sizeLeft) + bytesInLane = sizeLeft; + sizeLeft -= bytesInLane; + do { + *(curOutput++) = *(curInput++) ^ (unsigned char)lane; + lane >>= 8; + } while ( --bytesInLane != 0); + lanePosition++; + } + + while(sizeLeft >= SnP_laneLengthInBytes) { + // correcting cast-align and cast-qual errors + // old version: *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + *((UINT64*)(void*)curOutput) = *((const UINT64*)(const void*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + sizeLeft -= SnP_laneLengthInBytes; + lanePosition++; + curInput += SnP_laneLengthInBytes; + curOutput += SnP_laneLengthInBytes; + } + + if (sizeLeft != 0) { + UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; + do { + *(curOutput++) = *(curInput++) ^ (unsigned char)lane; + lane >>= 8; + } while ( --sizeLeft != 0); + } +} + +void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) +{ + // correcting cast-align and cast-qual errors + // old version: const UINT64 *curInput0 = (UINT64 *)input; + const UINT64 *curInput0 = (const void *)input; + // old version: const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes); + const UINT64 *curInput1 = (const void *)(input+laneOffset*1*SnP_laneLengthInBytes); + // old version: const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes); + const UINT64 *curInput2 = (const void *)(input+laneOffset*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes); + const UINT64 *curInput3 = (const void *)(input+laneOffset*3*SnP_laneLengthInBytes); + // correcting cast-align errors + // old version: UINT64 *curOutput0 = (UINT64 *)output; + UINT64 *curOutput0 = (void *)output; + // old version: UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes); + UINT64 *curOutput1 = (void *)(output+laneOffset*1*SnP_laneLengthInBytes); + // old version: UUINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes); + UINT64 *curOutput2 = (void *)(output+laneOffset*2*SnP_laneLengthInBytes); + // old version: UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes); + UINT64 *curOutput3 = (void *)(output+laneOffset*3*SnP_laneLengthInBytes); + + const V256 *stateAsLanes = (const V256 *)states; + const UINT64 *stateAsLanes64 = (const UINT64*)states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + unsigned int i; + + #define ExtrXor( argIndex ) \ + curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\ + curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\ + curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\ + curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3] + + #define ExtrXor4( argIndex ) \ + lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\ + lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\ + lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\ + lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\ + UNINTLEAVE(),\ + lanesL01 = LOAD256u( curInput0[argIndex]),\ + lanesH01 = LOAD256u( curInput1[argIndex]),\ + lanesL23 = LOAD256u( curInput2[argIndex]),\ + lanesH23 = 
LOAD256u( curInput3[argIndex]),\ + XOReq256( lanes0, lanesL01 ),\ + XOReq256( lanes1, lanesH01 ),\ + XOReq256( lanes2, lanesL23 ),\ + XOReq256( lanes3, lanesH23 ),\ + STORE256u( curOutput0[argIndex], lanes0 ),\ + STORE256u( curOutput1[argIndex], lanes1 ),\ + STORE256u( curOutput2[argIndex], lanes2 ),\ + STORE256u( curOutput3[argIndex], lanes3 ) + + if ( laneCount >= 16 ) { + ExtrXor4( 0 ); + ExtrXor4( 4 ); + ExtrXor4( 8 ); + ExtrXor4( 12 ); + if ( laneCount >= 20 ) { + ExtrXor4( 16 ); + for(i=20; i<laneCount; i++) + ExtrXor( i ); + } + else { + for(i=16; i<laneCount; i++) + ExtrXor( i ); + } + } + else { + for(i=0; i<laneCount; i++) + ExtrXor( i ); + } + #undef ExtrXor + #undef ExtrXor4 +} + +#define declareABCDE \ + V256 Aba, Abe, Abi, Abo, Abu; \ + V256 Aga, Age, Agi, Ago, Agu; \ + V256 Aka, Ake, Aki, Ako, Aku; \ + V256 Ama, Ame, Ami, Amo, Amu; \ + V256 Asa, Ase, Asi, Aso, Asu; \ + V256 Bba, Bbe, Bbi, Bbo, Bbu; \ + V256 Bga, Bge, Bgi, Bgo, Bgu; \ + V256 Bka, Bke, Bki, Bko, Bku; \ + V256 Bma, Bme, Bmi, Bmo, Bmu; \ + V256 Bsa, Bse, Bsi, Bso, Bsu; \ + V256 Ca, Ce, Ci, Co, Cu; \ + V256 Ca1, Ce1, Ci1, Co1, Cu1; \ + V256 Da, De, Di, Do, Du; \ + V256 Eba, Ebe, Ebi, Ebo, Ebu; \ + V256 Ega, Ege, Egi, Ego, Egu; \ + V256 Eka, Eke, Eki, Eko, Eku; \ + V256 Ema, Eme, Emi, Emo, Emu; \ + V256 Esa, Ese, Esi, Eso, Esu; \ + +#define prepareTheta \ + Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ + Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ + Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ + Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ + Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ + +/* --- Theta Rho Pi Chi Iota Prepare-theta */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \ + Ca = E##ba; \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + Ce = E##be; \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ + Cu = E##bu; \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(Ca, E##ga); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(Ce, E##ge); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + XOReq256(Ci, E##gi); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + XOReq256(Co, E##go); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ + XOReq256(Cu, E##gu); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + 
E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(Ca, E##ka); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(Ce, E##ke); \ + XOReq256(A##sa, Da); \ + ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + XOReq256(Ci, E##ki); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + XOReq256(Co, E##ko); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ + XOReq256(Cu, E##ku); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(Ca, E##ma); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(Ce, E##me); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + XOReq256(Ci, E##mi); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + XOReq256(Co, E##mo); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ + XOReq256(Cu, E##mu); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(Ca, E##sa); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(Ce, E##se); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + XOReq256(Ci, E##si); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + XOReq256(Co, E##so); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ + XOReq256(Cu, E##su); \ +\ + +/* --- Theta Rho Pi Chi Iota */ +/* --- 64-bit lanes mapped to 64-bit words */ +#define thetaRhoPiChiIota(i, A, E) \ + ROL64in256(Ce1, Ce, 1); \ + Da = XOR256(Cu, Ce1); \ + ROL64in256(Ci1, Ci, 1); \ + De = XOR256(Ca, Ci1); \ + ROL64in256(Co1, Co, 1); \ + Di = XOR256(Ce, Co1); \ + ROL64in256(Cu1, Cu, 1); \ + Do = XOR256(Ci, Cu1); \ + ROL64in256(Ca1, Ca, 1); \ + Du = XOR256(Co, Ca1); \ +\ + XOReq256(A##ba, Da); \ + Bba = A##ba; \ + XOReq256(A##ge, De); \ + ROL64in256(Bbe, A##ge, 44); \ + XOReq256(A##ki, Di); \ + ROL64in256(Bbi, A##ki, 43); \ + E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ + XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \ + XOReq256(A##mo, Do); \ + ROL64in256(Bbo, A##mo, 21); \ + E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ + XOReq256(A##su, Du); \ + ROL64in256(Bbu, A##su, 14); \ + E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ + E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ + E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ +\ + XOReq256(A##bo, Do); \ + ROL64in256(Bga, A##bo, 28); \ + XOReq256(A##gu, Du); \ + ROL64in256(Bge, A##gu, 20); \ + XOReq256(A##ka, Da); \ + ROL64in256(Bgi, A##ka, 3); \ + E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ + XOReq256(A##me, De); \ + ROL64in256(Bgo, A##me, 45); \ + E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ + XOReq256(A##si, Di); \ + ROL64in256(Bgu, A##si, 61); \ + E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ + E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ + E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ +\ + XOReq256(A##be, De); \ + ROL64in256(Bka, A##be, 1); \ + XOReq256(A##gi, Di); \ + ROL64in256(Bke, A##gi, 6); \ + XOReq256(A##ko, Do); \ + ROL64in256(Bki, A##ko, 25); \ + E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ + XOReq256(A##mu, Du); \ + ROL64in256_8(Bko, A##mu); \ + E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ + XOReq256(A##sa, Da); \ + 
ROL64in256(Bku, A##sa, 18); \ + E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ + E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ + E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ +\ + XOReq256(A##bu, Du); \ + ROL64in256(Bma, A##bu, 27); \ + XOReq256(A##ga, Da); \ + ROL64in256(Bme, A##ga, 36); \ + XOReq256(A##ke, De); \ + ROL64in256(Bmi, A##ke, 10); \ + E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ + XOReq256(A##mi, Di); \ + ROL64in256(Bmo, A##mi, 15); \ + E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ + XOReq256(A##so, Do); \ + ROL64in256_56(Bmu, A##so); \ + E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ + E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ + E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ +\ + XOReq256(A##bi, Di); \ + ROL64in256(Bsa, A##bi, 62); \ + XOReq256(A##go, Do); \ + ROL64in256(Bse, A##go, 55); \ + XOReq256(A##ku, Du); \ + ROL64in256(Bsi, A##ku, 39); \ + E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ + XOReq256(A##ma, Da); \ + ROL64in256(Bso, A##ma, 41); \ + E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ + XOReq256(A##se, De); \ + ROL64in256(Bsu, A##se, 2); \ + E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ + E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ + E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ +\ + +static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL}; + +#define copyFromState(X, state) \ + X##ba = LOAD256(state[ 0]); \ + X##be = LOAD256(state[ 1]); \ + X##bi = LOAD256(state[ 2]); \ + X##bo = LOAD256(state[ 3]); \ + X##bu = LOAD256(state[ 4]); \ + X##ga = LOAD256(state[ 5]); \ + X##ge = LOAD256(state[ 6]); \ + X##gi = LOAD256(state[ 7]); \ + X##go = LOAD256(state[ 8]); \ + X##gu = LOAD256(state[ 9]); \ + X##ka = LOAD256(state[10]); \ + X##ke = LOAD256(state[11]); \ + X##ki = LOAD256(state[12]); \ + X##ko = LOAD256(state[13]); \ + X##ku = LOAD256(state[14]); \ + X##ma = LOAD256(state[15]); \ + X##me = LOAD256(state[16]); \ + X##mi = LOAD256(state[17]); \ + X##mo = LOAD256(state[18]); \ + X##mu = LOAD256(state[19]); \ + X##sa = LOAD256(state[20]); \ + X##se = LOAD256(state[21]); \ + X##si = LOAD256(state[22]); \ + X##so = LOAD256(state[23]); \ + X##su = LOAD256(state[24]); \ + +#define copyToState(state, X) \ + STORE256(state[ 0], X##ba); \ + STORE256(state[ 1], X##be); \ + STORE256(state[ 2], X##bi); \ + STORE256(state[ 3], X##bo); \ + STORE256(state[ 4], X##bu); \ + STORE256(state[ 5], X##ga); \ + STORE256(state[ 6], X##ge); \ + STORE256(state[ 7], X##gi); \ + STORE256(state[ 8], X##go); \ + STORE256(state[ 9], X##gu); \ + STORE256(state[10], X##ka); \ + STORE256(state[11], X##ke); \ + STORE256(state[12], X##ki); \ + STORE256(state[13], X##ko); \ + STORE256(state[14], X##ku); \ + STORE256(state[15], X##ma); \ + STORE256(state[16], X##me); \ + STORE256(state[17], X##mi); \ + STORE256(state[18], X##mo); \ + STORE256(state[19], X##mu); \ + STORE256(state[20], X##sa); \ + STORE256(state[21], X##se); \ + STORE256(state[22], X##si); \ + STORE256(state[23], X##so); \ + STORE256(state[24], 
X##su); \ + +#define copyStateVariables(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; \ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ + X##su = Y##su; \ + + #ifdef KeccakP1600times4_fullUnrolling +#define FullUnrolling +#else +#define Unrolling KeccakP1600times4_unrolling +#endif +// The macro file is combined with source file directly +/*****#include "KeccakP-1600-unrolling_avx2.macros"*****/ +/*******************************************************/ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#if (defined(FullUnrolling)) +#define rounds24 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ + thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ + thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ + thetaRhoPiChiIotaPrepareTheta(10, A, E) \ + thetaRhoPiChiIotaPrepareTheta(11, E, A) \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 12) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=12) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \ + 
thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + thetaRhoPiChiIota(23, E, A) \ + +#elif (Unrolling == 6) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=6) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ + } \ + +#elif (Unrolling == 4) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=4) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ + } \ + +#elif (Unrolling == 3) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=3) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ + copyStateVariables(A, E) \ + } \ + +#elif (Unrolling == 2) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } \ + +#elif (Unrolling == 1) +#define rounds24 \ + prepareTheta \ + for(i=0; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#define rounds12 \ + prepareTheta \ + for(i=12; i<24; i++) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + copyStateVariables(A, E) \ + } \ + +#else +#error "Unrolling is not correctly specified!" 
+#endif + +#define roundsN(__nrounds) \ + prepareTheta \ + i = 24 - (__nrounds); \ + if ((i&1) != 0) { \ + thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + copyStateVariables(A, E) \ + ++i; \ + } \ + for( /* empty */; i<24; i+=2) { \ + thetaRhoPiChiIotaPrepareTheta(i , A, E) \ + thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ + } + +/*******************************************************/ + +void KeccakP1600times4_PermuteAll_24rounds(void *states) +{ + V256 *statesAsLanes = (V256 *)states; + declareABCDE + #ifndef KeccakP1600times4_fullUnrolling + unsigned int i; + #endif + + copyFromState(A, statesAsLanes) + rounds24 + copyToState(statesAsLanes, A) +} + +void KeccakP1600times4_PermuteAll_12rounds(void *states) +{ + V256 *statesAsLanes = (V256 *)states; + declareABCDE + #ifndef KeccakP1600times4_fullUnrolling + unsigned int i; + #endif + + copyFromState(A, statesAsLanes) + rounds12 + copyToState(statesAsLanes, A) +} + +size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) +{ + if (laneCount == 21) { +#if 0 + const unsigned char *dataStart = data; + const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + V256 *stateAsLanes = (V256 *)states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + #define Xor_In( argIndex ) \ + XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + #define Xor_In4( argIndex ) \ + lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ + XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ + XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ + XOReq256( stateAsLanes[argIndex+3], lanes3 ) + Xor_In4( 0 ); + Xor_In4( 4 ); + Xor_In4( 8 ); + Xor_In4( 12 ); + Xor_In4( 16 ); + Xor_In( 20 ); + #undef Xor_In + #undef Xor_In4 + KeccakP1600times4_PermuteAll_24rounds(states); + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + return (const unsigned char *)curData0 - dataStart; +#else +// unsigned int i; + const unsigned char *dataStart = data; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + V256 *statesAsLanes = (V256 *)states; + declareABCDE + + 
copyFromState(A, statesAsLanes) + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + #define XOR_In( Xxx, argIndex ) \ + XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + XOR_In( Aba, 0 ); + XOR_In( Abe, 1 ); + XOR_In( Abi, 2 ); + XOR_In( Abo, 3 ); + XOR_In( Abu, 4 ); + XOR_In( Aga, 5 ); + XOR_In( Age, 6 ); + XOR_In( Agi, 7 ); + XOR_In( Ago, 8 ); + XOR_In( Agu, 9 ); + XOR_In( Aka, 10 ); + XOR_In( Ake, 11 ); + XOR_In( Aki, 12 ); + XOR_In( Ako, 13 ); + XOR_In( Aku, 14 ); + XOR_In( Ama, 15 ); + XOR_In( Ame, 16 ); + XOR_In( Ami, 17 ); + XOR_In( Amo, 18 ); + XOR_In( Amu, 19 ); + XOR_In( Asa, 20 ); + #undef XOR_In + rounds24 + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + copyToState(statesAsLanes, A) + return (const unsigned char *)curData0 - dataStart; +#endif + } + else { +// unsigned int i; + const unsigned char *dataStart = data; + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); + KeccakP1600times4_PermuteAll_24rounds(states); + data += laneOffsetSerial*8; + dataByteLen -= laneOffsetSerial*8; + } + return data - dataStart; + } +} + +size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) +{ + if (laneCount == 21) { +#if 0 + const unsigned char *dataStart = data; + const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + V256 *stateAsLanes = states; + V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; + #define Xor_In( argIndex ) \ + XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + #define Xor_In4( argIndex ) \ + lanes0 = LOAD256u( curData0[argIndex]),\ + lanes1 = LOAD256u( curData1[argIndex]),\ + lanes2 = LOAD256u( curData2[argIndex]),\ + lanes3 = LOAD256u( curData3[argIndex]),\ + INTLEAVE(),\ + XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ + XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ + XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ + XOReq256( stateAsLanes[argIndex+3], lanes3 ) + Xor_In4( 0 ); + Xor_In4( 4 ); + Xor_In4( 8 ); + Xor_In4( 12 ); + Xor_In4( 16 ); + Xor_In( 20 ); + #undef Xor_In + #undef Xor_In4 + KeccakP1600times4_PermuteAll_12rounds(states); + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + return (const unsigned char *)curData0 - dataStart; +#else +// unsigned int i; + const unsigned char *dataStart = data; + // correcting cast-align errors + // old version: const UINT64 *curData0 = (const UINT64 *)data; + const UINT64 *curData0 = (const void *)data; + // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); + // old version: const UINT64 *curData2 = (const 
UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); + // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); + V256 *statesAsLanes = states; + declareABCDE + + copyFromState(A, statesAsLanes) + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + #define XOR_In( Xxx, argIndex ) \ + XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) + XOR_In( Aba, 0 ); + XOR_In( Abe, 1 ); + XOR_In( Abi, 2 ); + XOR_In( Abo, 3 ); + XOR_In( Abu, 4 ); + XOR_In( Aga, 5 ); + XOR_In( Age, 6 ); + XOR_In( Agi, 7 ); + XOR_In( Ago, 8 ); + XOR_In( Agu, 9 ); + XOR_In( Aka, 10 ); + XOR_In( Ake, 11 ); + XOR_In( Aki, 12 ); + XOR_In( Ako, 13 ); + XOR_In( Aku, 14 ); + XOR_In( Ama, 15 ); + XOR_In( Ame, 16 ); + XOR_In( Ami, 17 ); + XOR_In( Amo, 18 ); + XOR_In( Amu, 19 ); + XOR_In( Asa, 20 ); + #undef XOR_In + rounds12 + curData0 += laneOffsetSerial; + curData1 += laneOffsetSerial; + curData2 += laneOffsetSerial; + curData3 += laneOffsetSerial; + dataByteLen -= laneOffsetSerial*8; + } + copyToState(statesAsLanes, A) + return (const unsigned char *)curData0 - dataStart; +#endif + } + else { +// unsigned int i; + const unsigned char *dataStart = data; + + while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { + KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); + KeccakP1600times4_PermuteAll_12rounds(states); + data += laneOffsetSerial*8; + dataByteLen -= laneOffsetSerial*8; + } + return data - dataStart; + } +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h new file mode 100644 index 0000000000..2640191779 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-1600-times4-SnP_avx2.h @@ -0,0 +1,63 @@ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#pragma once + +/** For the documentation, see PlSnP-documentation.h. 
+ */ + +#include "KeccakP-SIMD256-config_avx2.h" +#include "kyber512r3_params.h" +#include "kyber512r3_fips202x4_avx2.h" + +#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" +#define KeccakP1600times4_statesSizeInBytes 800 +#define KeccakP1600times4_statesAlignment 32 +#define KeccakF1600times4_FastLoop_supported +#define KeccakP1600times4_12rounds_FastLoop_supported + +#include <stddef.h> + +#define KeccakP1600times4_StaticInitialize() +#define KeccakP1600times4_InitializeAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_InitializeAll) +void KeccakP1600times4_InitializeAll(void *states); +#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ + ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) +#define KeccakP1600times4_AddBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_AddBytes) +void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); +#define KeccakP1600times4_AddLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_AddLanesAll) +void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); +#define KeccakP1600times4_OverwriteBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteBytes) +void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); +#define KeccakP1600times4_OverwriteLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) +void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); +#define KeccakP1600times4_OverwriteWithZeroes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) +void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); +#define KeccakP1600times4_PermuteAll_12rounds S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) +void KeccakP1600times4_PermuteAll_12rounds(void *states); +#define KeccakP1600times4_PermuteAll_24rounds S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) +void KeccakP1600times4_PermuteAll_24rounds(void *states); +#define KeccakP1600times4_ExtractBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractBytes) +void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); +#define KeccakP1600times4_ExtractLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractLanesAll) +void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); +#define KeccakP1600times4_ExtractAndAddBytes S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) +void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); +#define KeccakP1600times4_ExtractAndAddLanesAll S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) +void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); +#define KeccakF1600times4_FastLoop_Absorb S2N_KYBER_512_R3_NAMESPACE(KeccakF1600times4_FastLoop_Absorb) +size_t 
KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); +#define KeccakP1600times4_12rounds_FastLoop_Absorb S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb) +size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h new file mode 100644 index 0000000000..1c65fe29b4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-SIMD256-config_avx2.h @@ -0,0 +1,3 @@ +#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled" +#define KeccakP1600times4_fullUnrolling +#define KeccakP1600times4_useAVX2 diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h new file mode 100644 index 0000000000..be08e84af2 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-align_avx2.h @@ -0,0 +1,31 @@ +/* +Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, +Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby +denoted as "the implementer". + +For more information, feedback or questions, please refer to our websites: +http://keccak.noekeon.org/ +http://keyak.noekeon.org/ +http://ketje.noekeon.org/ + +To the extent possible under law, the implementer has waived all copyright +and related or neighboring rights to the source code in this file. +http://creativecommons.org/publicdomain/zero/1.0/ +*/ + +#pragma once + +/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ +#ifdef ALIGN +#undef ALIGN +#endif + +#if defined(__GNUC__) +#define ALIGN(x) __attribute__ ((aligned(x))) +#elif defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#elif defined(__ARMCC_VERSION) +#define ALIGN(x) __align(x) +#else +#define ALIGN(x) +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h new file mode 100644 index 0000000000..8e8b73cf2a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/KeccakP-brg_endian_avx2.h @@ -0,0 +1,139 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#pragma once + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include <sys/isa_defs.h> +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include <sys/endian.h> +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include <machine/endian.h> +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include <endian.h> +# if !defined( __BEOS__ ) +# include <byteswap.h> +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + 
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h new file mode 100644 index 0000000000..79e6d9ec0c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_align_avx2.h @@ -0,0 +1,19 @@ +#pragma once + +#include <stdint.h> + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[(N+31)/32]; \ + } + +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[N]; \ + __m256i vec[(N+15)/16]; \ + } +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S new file mode 100644 index 0000000000..ed2a65be20 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_basemul_avx2.S @@ -0,0 +1,105 @@ +#include "kyber512r3_consts_avx2.h" + +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 + +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo + +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 + +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi + +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 + +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi + +vmovdqa %ymm13,(%rsp) + +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 
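/* Illustrative sketch (not from the patched sources): the schoolbook macro
 * above vectorizes Kyber's base multiplication of degree-1 pairs modulo
 * X^2 - zeta, 16 pairs per invocation. The scalar equivalent is sketched
 * below following the Kyber round-3 reference code; fqmul/montgomery_reduce
 * are reproduced here as an assumption, they are not part of this hunk. */
#include <stdint.h>

#define KYBER_Q    3329
#define KYBER_QINV (-3327)              /* q^-1 mod 2^16, cf. kyber512r3_consts_avx2.c */

static int16_t montgomery_reduce(int32_t a) {
    int16_t t = (int16_t)a * KYBER_QINV;                 /* a * q^-1 mod 2^16 */
    return (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);
}

static int16_t fqmul(int16_t a, int16_t b) {             /* Montgomery multiplication */
    return montgomery_reduce((int32_t)a * b);
}

static void basemul_ref(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
    r[0]  = fqmul(a[1], b[1]);
    r[0]  = fqmul(r[0], zeta);          /* zeta * a1 * b1 */
    r[0] += fqmul(a[0], b[0]);          /* + a0 * b0      */
    r[1]  = fqmul(a[0], b[1]);
    r[1] += fqmul(a[1], b[0]);          /* a0*b1 + a1*b0  */
}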
+vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) +.endm + +.text +.global cdecl(basemul_avx2_asm) +cdecl(basemul_avx2_asm): +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp + +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 + +add $32*2,%r9 +schoolbook 1 + +add $192*2,%r9 +schoolbook 2 + +add $32*2,%r9 +schoolbook 3 + +mov %r8,%rsp +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c new file mode 100644 index 0000000000..ef0bb87946 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.c @@ -0,0 +1,104 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_cbd.h" + +/************************************************* +* Name: load32_littleendian +* +* Description: load 4 bytes into a 32-bit integer +* in little-endian order +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x +**************************************************/ +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +/************************************************* +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + + +/************************************************* +* Name: cbd2 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ +static void cbd2(poly *r, const uint8_t buf[2 * S2N_KYBER_512_R3_N / 4]) { + unsigned int i, j; + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + uint32_t t = load32_littleendian(buf + 4 * i); + uint32_t d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) { + int16_t a = (d >> (4 * j + 0)) & 0x3; + int16_t b = (d >> (4 * j + 2)) & 0x3; + r->coeffs[8 * i + j] = a - b; + } + } +} + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3 +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: 
pointer to input byte array +**************************************************/ +static void cbd3(poly *r, const uint8_t buf[3 * S2N_KYBER_512_R3_N / 4]) { + unsigned int i, j; + + for (i = 0; i < S2N_KYBER_512_R3_N / 4; i++) { + uint32_t t = load24_littleendian(buf + 3 * i); + uint32_t d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + int16_t a = (d >> (6 * j + 0)) & 0x7; + int16_t b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} + +void cbd_eta1(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]) { + cbd3(r, buf); +} + +void cbd_eta2(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]) { + cbd2(r, buf); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h new file mode 100644 index 0000000000..631821956c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd.h @@ -0,0 +1,11 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" + +#define cbd_eta1 S2N_KYBER_512_R3_NAMESPACE(cbd_eta1) +void cbd_eta1(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]); + +#define cbd_eta2 S2N_KYBER_512_R3_NAMESPACE(cbd_eta2) +void cbd_eta2(poly *r, const uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c new file mode 100644 index 0000000000..a922bd220f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.c @@ -0,0 +1,137 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_cbd_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +/************************************************* +* Name: cbd2 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array +**************************************************/ +static void cbd2(poly * restrict r, const __m256i buf[2*S2N_KYBER_512_R3_N/128]) +{ + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i mask55 = _mm256_set1_epi32(0x55555555); + const __m256i mask33 = _mm256_set1_epi32(0x33333333); + const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); + + for(i = 0; i < S2N_KYBER_512_R3_N/64; i++) { + f0 = _mm256_load_si256(&buf[i]); + + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); + + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); + + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); + + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); + + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1)); + + 
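/* Illustrative sketch (not from the patched sources): the cbd2 routines
 * above (scalar and AVX2) rely on the masked-add trick
 * d = (t & 0x55555555) + ((t >> 1) & 0x55555555), which sums each pair of
 * adjacent bits in place; a coefficient is the difference of two such
 * 2-bit sums. Self-check against a naive per-bit count: */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int16_t cbd2_trick(uint32_t t, unsigned j) {
    uint32_t d = (t & 0x55555555u) + ((t >> 1) & 0x55555555u);
    int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
    int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
    return (int16_t)(a - b);
}

static int16_t cbd2_naive(uint32_t t, unsigned j) {
    int a = (int)((t >> (4 * j + 0)) & 1) + (int)((t >> (4 * j + 1)) & 1);
    int b = (int)((t >> (4 * j + 2)) & 1) + (int)((t >> (4 * j + 3)) & 1);
    return (int16_t)(a - b);
}

int main(void) {
    uint32_t t = 0xDEADBEEFu;                 /* stand-in for 4 uniformly random bytes */
    for (unsigned j = 0; j < 8; j++) {
        assert(cbd2_trick(t, j) == cbd2_naive(t, j));
        printf("coeff[%u] = %d\n", j, cbd2_trick(t, j));
    }
    return 0;
}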
_mm256_store_si256(&r->vec[4*i+0], f0); + _mm256_store_si256(&r->vec[4*i+1], f2); + _mm256_store_si256(&r->vec[4*i+2], f1); + _mm256_store_si256(&r->vec[4*i+3], f3); + } +} + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3 +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array +**************************************************/ +static void cbd3(poly * restrict r, const uint8_t buf[3*S2N_KYBER_512_R3_N/4+8]) +{ + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i mask249 = _mm256_set1_epi32(0x249249); + const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB); + const __m256i mask07 = _mm256_set1_epi32(7); + const __m256i mask70 = _mm256_set1_epi32(7 << 16); + const __m256i mask3 = _mm256_set1_epi16(3); + const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4, + -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0); + + for(i = 0; i < S2N_KYBER_512_R3_N/32; i++) { + // correcting cast-align and cast-qual errors + // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]); + f0 = _mm256_loadu_si256((const void *)&buf[24*i]); + f0 = _mm256_permute4x64_epi64(f0,0x94); + f0 = _mm256_shuffle_epi8(f0,shufbidx); + + f1 = _mm256_srli_epi32(f0,1); + f2 = _mm256_srli_epi32(f0,2); + f0 = _mm256_and_si256(mask249,f0); + f1 = _mm256_and_si256(mask249,f1); + f2 = _mm256_and_si256(mask249,f2); + f0 = _mm256_add_epi32(f0,f1); + f0 = _mm256_add_epi32(f0,f2); + + f1 = _mm256_srli_epi32(f0,3); + f0 = _mm256_add_epi32(f0,mask6DB); + f0 = _mm256_sub_epi32(f0,f1); + + f1 = _mm256_slli_epi32(f0,10); + f2 = _mm256_srli_epi32(f0,12); + f3 = _mm256_srli_epi32(f0, 2); + f0 = _mm256_and_si256(f0,mask07); + f1 = _mm256_and_si256(f1,mask70); + f2 = _mm256_and_si256(f2,mask07); + f3 = _mm256_and_si256(f3,mask70); + f0 = _mm256_add_epi16(f0,f1); + f1 = _mm256_add_epi16(f2,f3); + f0 = _mm256_sub_epi16(f0,mask3); + f1 = _mm256_sub_epi16(f1,mask3); + + f2 = _mm256_unpacklo_epi32(f0,f1); + f3 = _mm256_unpackhi_epi32(f0,f1); + + f0 = _mm256_permute2x128_si256(f2,f3,0x20); + f1 = _mm256_permute2x128_si256(f2,f3,0x31); + + _mm256_store_si256(&r->vec[2*i+0], f0); + _mm256_store_si256(&r->vec[2*i+1], f1); + } +} + +/* buf 32 bytes longer for cbd3 */ +void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1]) +{ + // correcting cast-align and cast-qual errors + // old version: cbd3(r, (uint8_t *)buf); + cbd3(r, (const void *)buf); +} + +void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128]) +{ + cbd2(r, buf); +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h new file mode 100644 index 0000000000..972c71fbf5 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_cbd_avx2.h @@ -0,0 +1,15 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define poly_cbd_eta1_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_cbd_eta1_avx2) +void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1]); + +#define poly_cbd_eta2_avx2 
S2N_KYBER_512_R3_NAMESPACE(poly_cbd_eta2_avx2) +void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128]); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c new file mode 100644 index 0000000000..cdc0b817df --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.c @@ -0,0 +1,122 @@ +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_consts_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define Q S2N_KYBER_512_R3_Q +#define MONT -1044 // 2^16 mod q +#define QINV -3327 // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO -10079 // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI +#define MASK 4095 +#define SHIFT 32 + +const qdata_t qdata = {{ +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + +#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 
11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT +}}; +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h new file mode 100644 index 0000000000..1983ba44d6 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_consts_avx2.h @@ -0,0 +1,43 @@ +#pragma once + +#include "kyber512r3_params.h" + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found. 
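/* Illustrative sketch (not from the patched sources): verify the relations
 * stated in the comments of kyber512r3_consts_avx2.c above for q = 3329:
 * MONT = 2^16 mod q (signed 16-bit representative), QINV = q^-1 mod 2^16,
 * and V = floor(2^26/q + 0.5). */
#include <assert.h>
#include <stdint.h>

int main(void) {
    const int32_t q = 3329;
    const int32_t mont = -1044, v = 20159;
    const int16_t qinv = -3327;

    assert(mont + q == (1 << 16) % q);               /* both sides are 2285       */
    assert((uint16_t)((int32_t)qinv * q) == 1u);     /* q * q^-1 == 1 (mod 2^16)  */
    assert(v == ((1 << 26) + q / 2) / q);            /* rounded quotient 2^26 / q */
    return 0;
}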
+ * + * This define helps us get around this + */ +#ifdef __ASSEMBLER__ +#if defined(__WIN32__) || defined(__APPLE__) +#define decorate(s) _##s +#define cdecl2(s) decorate(s) +#define cdecl(s) cdecl2(S2N_KYBER_512_R3_NAMESPACE(##s)) +#else +#define cdecl(s) S2N_KYBER_512_R3_NAMESPACE(##s) +#endif +#endif + + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#ifndef __ASSEMBLER__ +#include "kyber512r3_align_avx2.h" +typedef ALIGNED_INT16(640) qdata_t; +#define qdata S2N_KYBER_512_R3_NAMESPACE(qdata) +extern const qdata_t qdata; +#endif +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c index 8289a526b3..c5ce0c91f2 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.c +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.c @@ -7,7 +7,9 @@ #include <stddef.h> #include <stdint.h> -#include "fips202.h" + +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" #define NROUNDS 24 #define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) @@ -24,7 +26,7 @@ static uint64_t load64(const uint8_t *x) { uint64_t r = 0; for (size_t i = 0; i < 8; ++i) { - r |= (uint64_t) x[i] << 8 * i; + r |= (uint64_t)x[i] << 8 * i; } return r; @@ -46,18 +48,19 @@ static void store64(uint8_t *x, uint64_t u) { /* Keccak round constants */ static const uint64_t KeccakF_RoundConstants[NROUNDS] = { - 0x0000000000000001ULL, 0x0000000000008082ULL, - 0x800000000000808aULL, 0x8000000080008000ULL, - 0x000000000000808bULL, 0x0000000080000001ULL, - 0x8000000080008081ULL, 0x8000000000008009ULL, - 0x000000000000008aULL, 0x0000000000000088ULL, - 0x0000000080008009ULL, 0x000000008000000aULL, - 0x000000008000808bULL, 0x800000000000008bULL, - 0x8000000000008089ULL, 0x8000000000008003ULL, - 0x8000000000008002ULL, 0x8000000000000080ULL, - 0x000000000000800aULL, 0x800000008000000aULL, - 0x8000000080008081ULL, 0x8000000000008080ULL, - 0x0000000080000001ULL, 0x8000000080008008ULL}; + 0x0000000000000001ULL, 0x0000000000008082ULL, + 0x800000000000808aULL, 0x8000000080008000ULL, + 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x000000000000008aULL, 0x0000000000000088ULL, + 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, + 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, + 0x0000000080000001ULL, 0x8000000080008008ULL, +}; /************************************************* * Name: KeccakF1600_StatePermute @@ -74,9 +77,8 @@ static void KeccakF1600_StatePermute(uint64_t *state) { uint64_t Aka, Ake, Aki, Ako, Aku; uint64_t Ama, Ame, Ami, Amo, Amu; uint64_t Asa, Ase, Asi, Aso, Asu; - uint64_t BCa, BCe, BCi, BCo, BCu; - // copyFromState(A, state) + /* copyFromState(A, state) */ Aba = state[0]; Abe = state[1]; Abi = state[2]; @@ -104,6 +106,7 @@ static void KeccakF1600_StatePermute(uint64_t *state) { Asu = state[24]; for (round = 0; round < NROUNDS; round += 2) { + uint64_t BCa, BCe, BCi, BCo, BCu; uint64_t Da, De, Di, Do, Du; uint64_t Eba, Ebe, Ebi, Ebo, Ebu; uint64_t Ega, Ege, Egi, Ego, Egu; @@ -111,14 +114,14 @@ static void KeccakF1600_StatePermute(uint64_t *state) { uint64_t Ema, Eme, Emi, Emo, Emu; uint64_t Esa, Ese, Esi, Eso, Esu; - // prepareTheta + /* prepareTheta */ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase; BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi; BCo = Abo 
^ Ago ^ Ako ^ Amo ^ Aso; BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - // thetaRhoPiChiIotaPrepareTheta(round , A, E) + /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */ Da = BCu ^ ROL(BCe, 1); De = BCa ^ ROL(BCi, 1); Di = BCe ^ ROL(BCo, 1); @@ -206,14 +209,14 @@ static void KeccakF1600_StatePermute(uint64_t *state) { Eso = BCo ^ ((~BCu) & BCa); Esu = BCu ^ ((~BCa) & BCe); - // prepareTheta + /* prepareTheta */ BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa; BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - // thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ Da = BCu ^ ROL(BCe, 1); De = BCa ^ ROL(BCi, 1); Di = BCe ^ ROL(BCo, 1); @@ -302,7 +305,7 @@ static void KeccakF1600_StatePermute(uint64_t *state) { Asu = BCu ^ ((~BCa) & BCe); } - // copyToState(state, A) + /* copyToState(state, A) */ state[0] = Aba; state[1] = Abe; state[2] = Abi; @@ -400,6 +403,37 @@ static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s, uint32 } /************************************************* + * Name: shake128_absorb + * + * Description: Absorb step of the SHAKE128 XOF. + * non-incremental, starts by zeroeing the state. + * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - const uint8_t *input: pointer to input to be absorbed + * into s + * - size_t inlen: length of input in bytes + **************************************************/ +void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen) { + keccak_absorb(state->ctx, S2N_KYBER_512_R3_SHAKE128_RATE, input, inlen, 0x1F); +} + +/************************************************* + * Name: shake128_squeezeblocks + * + * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of + * SHAKE128_RATE bytes each. Modifies the state. Can be called + * multiple times to keep squeezing, i.e., is incremental. + * + * Arguments: - uint8_t *output: pointer to output blocks + * - size_t nblocks: number of blocks to be squeezed + * (written to output) + * - shake128ctx *state: pointer to input/output Keccak state + **************************************************/ +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state) { + keccak_squeezeblocks(output, nblocks, state->ctx, S2N_KYBER_512_R3_SHAKE128_RATE); +} + +/************************************************* * Name: shake256_absorb * * Description: Absorb step of the SHAKE256 XOF. 
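/* Illustrative sketch (not from the patched sources): typical use of the
 * shake128_absorb / shake128_squeezeblocks pair exported above - absorb the
 * input once, then squeeze any number of full 168-byte blocks. This is the
 * pattern gen_matrix follows later in this change. */
#include <stdint.h>
#include "kyber512r3_fips202.h"

void squeeze_two_blocks(uint8_t out[2 * S2N_KYBER_512_R3_SHAKE128_RATE],
                        const uint8_t seed[32]) {
    shake128ctx st;
    shake128_absorb(&st, seed, 32);         /* non-incremental: starts from a zeroed state */
    shake128_squeezeblocks(out, 2, &st);    /* 2 * 168 = 336 output bytes */
}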
@@ -410,8 +444,8 @@ static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, uint64_t *s, uint32 * into s * - size_t inlen: length of input in bytes **************************************************/ -static void shake256_absorb(shake256_ctx *state, const uint8_t *input, size_t inlen) { - keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F); +void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen) { + keccak_absorb(state->ctx, S2N_KYBER_512_R3_SHAKE256_RATE, input, inlen, 0x1F); } /************************************************* @@ -426,8 +460,8 @@ static void shake256_absorb(shake256_ctx *state, const uint8_t *input, size_t in * (written to output) * - shake256ctx *state: pointer to input/output Keccak state **************************************************/ -static void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256_ctx *state) { - keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE); +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state) { + keccak_squeezeblocks(output, nblocks, state->ctx, S2N_KYBER_512_R3_SHAKE256_RATE); } /************************************************* @@ -441,15 +475,15 @@ static void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256_ctx * - size_t inlen: length of input in bytes **************************************************/ void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen) { - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[SHAKE256_RATE]; - shake256_ctx s; + size_t nblocks = outlen / S2N_KYBER_512_R3_SHAKE256_RATE; + uint8_t t[S2N_KYBER_512_R3_SHAKE256_RATE]; + shake256ctx s; shake256_absorb(&s, input, inlen); shake256_squeezeblocks(output, nblocks, &s); - output += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; + output += nblocks * S2N_KYBER_512_R3_SHAKE256_RATE; + outlen -= nblocks * S2N_KYBER_512_R3_SHAKE256_RATE; if (outlen) { shake256_squeezeblocks(t, 1, &s); @@ -459,3 +493,50 @@ void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen } } +/************************************************* + * Name: sha3_256 + * + * Description: SHA3-256 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[S2N_KYBER_512_R3_SHA3_256_RATE]; + + /* Absorb input */ + keccak_absorb(s, S2N_KYBER_512_R3_SHA3_256_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, S2N_KYBER_512_R3_SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +/************************************************* + * Name: sha3_512 + * + * Description: SHA3-512 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[S2N_KYBER_512_R3_SHA3_512_RATE]; + + /* Absorb input */ + keccak_absorb(s, S2N_KYBER_512_R3_SHA3_512_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, S2N_KYBER_512_R3_SHA3_512_RATE); + + for (size_t i = 0; i < 64; i++) { + output[i] = t[i]; + } +} diff --git 
a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h new file mode 100644 index 0000000000..1f4f395f72 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202.h @@ -0,0 +1,68 @@ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include "kyber512r3_params.h" + +#define S2N_KYBER_512_R3_SHAKE128_RATE 168 +#define S2N_KYBER_512_R3_SHAKE256_RATE 136 +#define S2N_KYBER_512_R3_SHA3_256_RATE 136 +#define S2N_KYBER_512_R3_SHA3_384_RATE 104 +#define S2N_KYBER_512_R3_SHA3_512_RATE 72 + +#define S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE 25 + +/* Context for non-incremental API */ +#define shake128ctx S2N_KYBER_512_R3_NAMESPACE(shake128ctx) +typedef struct { + uint64_t ctx[S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE]; +} shake128ctx; + +/* Context for non-incremental API */ +#define shake256ctx S2N_KYBER_512_R3_NAMESPACE(shake256ctx) +typedef struct { + uint64_t ctx[S2N_KYBER_512_R3_PQC_SHAKECTX_SIZE]; +} shake256ctx; + +/* Initialize the state and absorb the provided input. + * + * This function does not support being called multiple times + * with the same state. + */ +#define shake128_absorb S2N_KYBER_512_R3_NAMESPACE(shake128_absorb) +void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen); +/* Squeeze output out of the sponge. + * + * Supports being called multiple times + */ +#define shake128_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake128_squeezeblocks) +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state); + +/* Copy the state. */ +#define shake128_ctx_clone S2N_KYBER_512_R3_NAMESPACE(shake128_ctx_clone) +void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src); + +/* Initialize the state and absorb the provided input. + * + * This function does not support being called multiple times + * with the same state. + */ +#define shake256_absorb S2N_KYBER_512_R3_NAMESPACE(shake256_absorb) +void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen); +/* Squeeze output out of the sponge. 
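/* Illustrative sketch (not from the patched sources): the rate constants
 * defined above all follow the Keccak rule rate = 200 - 2 * (security
 * strength in bytes), i.e. the 200-byte state minus the capacity. */
#include <assert.h>
#include "kyber512r3_fips202.h"

int main(void) {
    assert(S2N_KYBER_512_R3_SHAKE128_RATE == 200 - 2 * 16);   /* 168 */
    assert(S2N_KYBER_512_R3_SHAKE256_RATE == 200 - 2 * 32);   /* 136 */
    assert(S2N_KYBER_512_R3_SHA3_256_RATE == 200 - 2 * 32);   /* 136 */
    assert(S2N_KYBER_512_R3_SHA3_384_RATE == 200 - 2 * 48);   /* 104 */
    assert(S2N_KYBER_512_R3_SHA3_512_RATE == 200 - 2 * 64);   /*  72 */
    return 0;
}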
+ * + * Supports being called multiple times + */ +#define shake256_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake256_squeezeblocks) +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state); + +/* One-stop SHAKE256 call */ +#define shake256 S2N_KYBER_512_R3_NAMESPACE(shake256) +void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen); + +#define sha3_256 S2N_KYBER_512_R3_NAMESPACE(sha3_256) +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen); + +/* One-stop SHA3-512 shop */ +#define sha3_512 S2N_KYBER_512_R3_NAMESPACE(sha3_512) +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c new file mode 100644 index 0000000000..5f07fb44a3 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.c @@ -0,0 +1,210 @@ +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include "kyber512r3_fips202.h" +#include "kyber512r3_fips202x4_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define KeccakF1600_StatePermute4x S2N_KYBER_512_R3_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) +extern void KeccakF1600_StatePermute4x(__m256i *s); + +/* Implementation is used from Crystal Kyber Repository + * See for more details: https://github.com/XKCP/XKCP */ + +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) +{ + size_t i; + uint64_t pos = 0; + __m256i t, idx; + + for(i = 0; i < 25; ++i) + s[i] = _mm256_setzero_si256(); + + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while(inlen >= r) { + for(i = 0; i < r/8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; + } + inlen -= r; + + KeccakF1600_StatePermute4x(s); + } + + for(i = 0; i < inlen/8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; + } + inlen -= 8*i; + + if(inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((1ULL << (8*inlen)) - 1); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); + } + + t = _mm256_set1_epi64x((uint64_t)p << 8*inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x(1ULL << 63); + s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t); +} + +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + unsigned int r, + __m256i s[25]) +{ + unsigned int i; + __m128d t; + + while(nblocks > 0) { + KeccakF1600_StatePermute4x(s); + for(i=0; i < r/8; ++i) { + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + // correcting cast-align errors + // old version: _mm_storel_pd((__attribute__((__may_alias__)) double *)&out0[8*i], t); + _mm_storel_pd((__attribute__((__may_alias__)) void *)&out0[8*i], t); + // old version: _mm_storeh_pd((__attribute__((__may_alias__)) double *)&out1[8*i], t); + _mm_storeh_pd((__attribute__((__may_alias__)) void *)&out1[8*i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); + // old version: _mm_storel_pd((__attribute__((__may_alias__)) double *)&out2[8*i], t); + _mm_storel_pd((__attribute__((__may_alias__)) void *)&out2[8*i], t); + // old version: 
_mm_storeh_pd((__attribute__((__may_alias__)) double *)&out3[8*i], t); + _mm_storeh_pd((__attribute__((__may_alias__)) void *)&out3[8*i], t); + } + + out0 += r; + out1 += r; + out2 += r; + out3 += r; + --nblocks; + } +} + +void shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + keccakx4_absorb_once(state->s, S2N_KYBER_512_R3_SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); +} + +void shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) +{ + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, S2N_KYBER_512_R3_SHAKE128_RATE, state->s); +} + +void shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + keccakx4_absorb_once(state->s, S2N_KYBER_512_R3_SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); +} + +void shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) +{ + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, S2N_KYBER_512_R3_SHAKE256_RATE, state->s); +} + +void shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + unsigned int i; + size_t nblocks = outlen/S2N_KYBER_512_R3_SHAKE128_RATE; + uint8_t t[4][S2N_KYBER_512_R3_SHAKE128_RATE]; + keccakx4_state state; + + shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); + shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + out1 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + out2 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + out3 += nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + outlen -= nblocks*S2N_KYBER_512_R3_SHAKE128_RATE; + + if(outlen) { + shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for(i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } + } +} + +void shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) +{ + unsigned int i; + size_t nblocks = outlen/S2N_KYBER_512_R3_SHAKE256_RATE; + uint8_t t[4][S2N_KYBER_512_R3_SHAKE256_RATE]; + keccakx4_state state; + + shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); + shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + out1 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + out2 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + out3 += nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + outlen -= nblocks*S2N_KYBER_512_R3_SHAKE256_RATE; + + if(outlen) { + shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for(i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } + } +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h new file mode 100644 index 0000000000..8c4896724c --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fips202x4_avx2.h @@ -0,0 +1,70 @@ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include 
"kyber512r3_params.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define keccakx4_state S2N_KYBER_512_R3_NAMESPACE(keccakx4_state) +typedef struct { + __m256i s[25]; +} keccakx4_state; + +#define shake128x4_absorb_once S2N_KYBER_512_R3_NAMESPACE(shake128x4_absorb_once) +void shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#define shake128x4_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake128x4_squeezeblocks) +void shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); + +#define shake256x4_absorb_once S2N_KYBER_512_R3_NAMESPACE(shake256x4_absorb_once) +void shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#define shake256x4_squeezeblocks S2N_KYBER_512_R3_NAMESPACE(shake256x4_squeezeblocks) +void shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); + +#define shake128x4 S2N_KYBER_512_R3_NAMESPACE(shake128x4) +void shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#define shake256x4 S2N_KYBER_512_R3_NAMESPACE(shake256x4) +void shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S new file mode 100644 index 0000000000..3492489a67 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_fq_avx2.S @@ -0,0 +1,122 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "fq.inc"*****/ +/***************************/ +.macro red16 r,rs=0,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else +vpsraw $10,%ymm\x,%ymm\x +.endif +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm +/***************************/ + +.text +reduce128_avx: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +.global cdecl(reduce_avx2_asm) +cdecl(reduce_avx2_asm): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret + +tomont128_avx: +#load +vmovdqa 
(%rdi),%ymm3 +vmovdqa 32(%rdi),%ymm4 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm6 +vmovdqa 128(%rdi),%ymm7 +vmovdqa 160(%rdi),%ymm8 +vmovdqa 192(%rdi),%ymm9 +vmovdqa 224(%rdi),%ymm10 + +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 + +#store +vmovdqa %ymm3,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm7,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm9,192(%rdi) +vmovdqa %ymm10,224(%rdi) + +ret + +.global cdecl(tomont_avx2_asm) +cdecl(tomont_avx2_asm): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c new file mode 100644 index 0000000000..ace1783448 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.c @@ -0,0 +1,323 @@ +#include <stddef.h> +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_indcpa.h" +#include "kyber512r3_poly.h" +#include "kyber512r3_polyvec.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_symmetric.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_safety.h" + +/************************************************* +* Name: pack_pk +* +* Description: Serialize the public key as concatenation of the +* serialized vector of polynomials pk +* and the public seed used to generate the matrix A. +* +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec +* const uint8_t *seed: pointer to the input public seed +**************************************************/ +static void pack_pk(uint8_t r[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], polyvec *pk, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]) { + polyvec_tobytes(r, pk); + for (size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + r[i + S2N_KYBER_512_R3_POLYVECBYTES] = seed[i]; + } +} + +/************************************************* +* Name: unpack_pk +* +* Description: De-serialize public key from a byte array; +* approximate inverse of pack_pk +* +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A +* - const uint8_t *packedpk: pointer to input serialized public key +**************************************************/ +static void unpack_pk(polyvec *pk, uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], const uint8_t packedpk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES]) { + polyvec_frombytes(pk, packedpk); + for (size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + seed[i] = packedpk[i + S2N_KYBER_512_R3_POLYVECBYTES]; + } +} + +/************************************************* +* Name: pack_sk +* +* Description: Serialize the secret key +* +* Arguments: - uint8_t *r: pointer to output serialized secret key +* - polyvec *sk: pointer to input vector of polynomials (secret key) +**************************************************/ +static void pack_sk(uint8_t r[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES], polyvec *sk) { + polyvec_tobytes(r, sk); +} + +/************************************************* +* Name: unpack_sk +* +* Description: De-serialize the secret key; +* inverse of pack_sk +* +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials 
(secret key) +* - const uint8_t *packedsk: pointer to input serialized secret key +**************************************************/ +static void unpack_sk(polyvec *sk, const uint8_t packedsk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) { + polyvec_frombytes(sk, packedsk); +} + +/************************************************* +* Name: pack_ciphertext +* +* Description: Serialize the ciphertext as concatenation of the +* compressed and serialized vector of polynomials b +* and the compressed and serialized polynomial v +* +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v +**************************************************/ +static void pack_ciphertext(uint8_t r[S2N_KYBER_512_R3_INDCPA_BYTES], polyvec *b, poly *v) { + polyvec_compress(r, b); + poly_compress(r + S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: unpack_ciphertext +* +* Description: De-serialize and decompress ciphertext from a byte array; +* approximate inverse of pack_ciphertext +* +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v +* - const uint8_t *c: pointer to the input serialized ciphertext +**************************************************/ +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES]) { + polyvec_decompress(b, c); + poly_decompress(v, c + S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES); +} + +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, unsigned int len, const uint8_t *buf, unsigned int buflen) { + unsigned int ctr, pos; + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + uint16_t val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + uint16_t val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; + + if (val0 < S2N_KYBER_512_R3_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < S2N_KYBER_512_R3_Q) { + r[ctr++] = val1; + } + } + + return ctr; +} + +/************************************************* +* Name: gen_matrix +* +* Description: Deterministically generate matrix A (or the transpose of A) +* from a seed. Entries of the matrix are polynomials that look +* uniformly random. 
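/* Illustrative sketch (not from the patched sources): rej_uniform above
 * unpacks every 3 input bytes into two 12-bit candidates and keeps only
 * those below q = 3329, so each candidate survives with probability
 * 3329/4096, roughly 0.813. Worked example for one 3-byte group: */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint16_t q = 3329;
    const uint8_t buf[3] = { 0x12, 0xF4, 0xD6 };

    uint16_t val0 = (uint16_t)(((buf[0] >> 0) | ((uint16_t)buf[1] << 8)) & 0xFFF);
    uint16_t val1 = (uint16_t)(((buf[1] >> 4) | ((uint16_t)buf[2] << 4)) & 0xFFF);

    printf("val0 = %u -> %s\n", val0, val0 < q ? "accepted" : "rejected");  /* 1042, accepted */
    printf("val1 = %u -> %s\n", val1, val1 < q ? "accepted" : "rejected");  /* 3439, rejected */
    return 0;
}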
Performs rejection sampling on output of +* a XOF +* +* Arguments: - polyvec *a: pointer to ouptput matrix A +* - const uint8_t *seed: pointer to input seed +* - int transposed: boolean deciding whether A or A^T +* is generated +**************************************************/ +#define XOF_BLOCKBYTES 168 +#define GEN_MATRIX_NBLOCKS ((12*S2N_KYBER_512_R3_N/8*(1 << 12)/S2N_KYBER_512_R3_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +static void gen_matrix(polyvec *a, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], int transposed) { + unsigned int ctr, buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; + xof_state state; + + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + for (unsigned int j = 0; j < S2N_KYBER_512_R3_K; j++) { + if (transposed) { + kyber_shake128_absorb(&state, seed, i, j); + } else { + kyber_shake128_absorb(&state, seed, j, i); + } + + shake128_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, S2N_KYBER_512_R3_N, buf, buflen); + + while (ctr < S2N_KYBER_512_R3_N) { + off = buflen % 3; + for (unsigned int k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + shake128_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, S2N_KYBER_512_R3_N - ctr, buf, buflen); + } + } + } +} + +/************************************************* +* Name: indcpa_keypair +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - uint8_t *pk: pointer to output public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES bytes) +* +* Returns: 0 on success +* !0 on failure +**************************************************/ +int indcpa_keypair(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) { + uint8_t buf[2 * S2N_KYBER_512_R3_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + S2N_KYBER_512_R3_SYMBYTES; + uint8_t nonce = 0; + polyvec a[S2N_KYBER_512_R3_K], e, pkpv, skpv; + + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES)); + sha3_512(buf, buf, S2N_KYBER_512_R3_SYMBYTES); + + gen_matrix(a, publicseed, 0); + + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); + } + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); + } + + polyvec_ntt(&skpv); + polyvec_ntt(&e); + + //* matrix-vector multiplication */ + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + poly_tomont(&pkpv.vec[i]); + } + + polyvec_add(&pkpv, &pkpv, &e); + polyvec_reduce(&pkpv); + + pack_sk(sk, &skpv); + pack_pk(pk, &pkpv, publicseed); + + return 0; +} + +/************************************************* +* Name: indcpa_enc +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
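/* Illustrative sketch (not from the patched sources): spell out the
 * GEN_MATRIX_NBLOCKS arithmetic used by gen_matrix above for Kyber-512
 * (n = 256, q = 3329, XOF block = 168 bytes). The initial squeeze is
 * 3 blocks = 504 bytes = 336 twelve-bit candidates, of which roughly 81%
 * pass rejection (about 273), so one pass usually yields all 256 coefficients
 * and the retry loop rarely runs. */
#include <assert.h>

int main(void) {
    const int n = 256, q = 3329, xof_blockbytes = 168;
    int nblocks = (12 * n / 8 * (1 << 12) / q + xof_blockbytes) / xof_blockbytes;

    assert(nblocks == 3);
    assert(nblocks * xof_blockbytes == 504);    /* bytes squeezed before the retry loop */
    assert(504 / 3 * 2 == 336);                 /* 12-bit candidates per initial squeeze */
    return 0;
}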
+* +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length S2N_KYBER_512_R3_SYMBYTES) +* to deterministically generate all +* randomness +**************************************************/ +void indcpa_enc(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]) { + uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]; + uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[S2N_KYBER_512_R3_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed, pk); + poly_frommsg(&k, m); + gen_matrix(at, seed, 1); + + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta1(sp.vec + i, coins, nonce++); + } + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_getnoise_eta2(ep.vec + i, coins, nonce++); + } + poly_getnoise_eta2(&epp, coins, nonce++); + + polyvec_ntt(&sp); + + /* matrix-vector multiplication */ + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + } + + polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + + polyvec_invntt_tomont(&bp); + poly_invntt_tomont(&v); + + polyvec_add(&bp, &bp, &ep); + poly_add(&v, &v, &epp); + poly_add(&v, &v, &k); + polyvec_reduce(&bp); + poly_reduce(&v); + + pack_ciphertext(c, &bp, &v); +} + +/************************************************* +* Name: indcpa_dec +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
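/* Illustrative sketch (not from the patched sources): indcpa_enc above feeds
 * the same coins to every noise polynomial and only bumps a one-byte nonce.
 * The PRF this relies on is assumed here to be SHAKE-256 over (coins || nonce),
 * as in the Kyber reference design; the real definition lives in
 * kyber512r3_symmetric.h, which is not shown in this hunk. */
#include <stdint.h>
#include <string.h>
#include "kyber512r3_fips202.h"

#define SYMBYTES 32                    /* assumed value of S2N_KYBER_512_R3_SYMBYTES */

static void prf_sketch(uint8_t *out, size_t outlen,
                       const uint8_t coins[SYMBYTES], uint8_t nonce) {
    uint8_t extseed[SYMBYTES + 1];
    memcpy(extseed, coins, SYMBYTES);
    extseed[SYMBYTES] = nonce;                      /* domain separation per nonce++ */
    shake256(out, outlen, extseed, sizeof extseed);
}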
+* +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES) +**************************************************/ +void indcpa_dec(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) { + polyvec bp, skpv; + poly v, mp; + + unpack_ciphertext(&bp, &v, c); + unpack_sk(&skpv, sk); + + polyvec_ntt(&bp); + polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + poly_invntt_tomont(&mp); + + poly_sub(&mp, &v, &mp); + poly_reduce(&mp); + + poly_tomsg(m, &mp); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h new file mode 100644 index 0000000000..f8b9e401a0 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa.h @@ -0,0 +1,15 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +#define indcpa_keypair S2N_KYBER_512_R3_NAMESPACE(indcpa_keypair) +int indcpa_keypair(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); + +#define indcpa_enc S2N_KYBER_512_R3_NAMESPACE(indcpa_enc) +void indcpa_enc(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]); + +#define indcpa_dec S2N_KYBER_512_R3_NAMESPACE(indcpa_dec) +void indcpa_dec(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c new file mode 100644 index 0000000000..91e7513881 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.c @@ -0,0 +1,363 @@ +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_params.h" +#include "kyber512r3_indcpa_avx2.h" +#include "kyber512r3_polyvec_avx2.h" +#include "kyber512r3_poly_avx2.h" +#include "kyber512r3_rejsample_avx2.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_fips202x4_avx2.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_safety.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +/************************************************* +* Name: pack_pk +* +* Description: Serialize the public key as concatenation of the +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by polyvec_reduce_avx2(). 
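The declarations in kyber512r3_indcpa.h above are only ever driven by the KEM layer elsewhere in this patch, but a self-contained round trip makes the contract easier to see. The sketch below is hypothetical (the harness name, the fixed message and the all-zero coins are illustrative; real callers must supply fresh random coins) and is not code from this commit:

    #include <stdint.h>
    #include <string.h>
    #include "kyber512r3_indcpa.h"
    #include "kyber512r3_params.h"

    /* Hypothetical round trip over the IND-CPA API declared above; returns 0 if
     * decryption recovers the original 32-byte message. */
    static int indcpa_roundtrip_example(void)
    {
        uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES];
        uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES];
        uint8_t ct[S2N_KYBER_512_R3_INDCPA_BYTES];
        uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES] = { 1 };  /* message to protect */
        uint8_t out[S2N_KYBER_512_R3_INDCPA_MSGBYTES];
        uint8_t coins[S2N_KYBER_512_R3_SYMBYTES] = { 0 };       /* must be fresh randomness in practice */

        if (indcpa_keypair(pk, sk) != 0) {
            return -1;                      /* key generation can fail if the RNG fails */
        }
        indcpa_enc(ct, msg, pk, coins);     /* deterministic given the coins */
        indcpa_dec(out, ct, sk);
        return memcmp(msg, out, sizeof(msg)) == 0 ? 0 : -1;
    }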
+* +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec +* const uint8_t *seed: pointer to the input public seed +**************************************************/ +static void pack_pk(uint8_t r[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]) +{ + polyvec_tobytes_avx2(r, pk); + memcpy(r+S2N_KYBER_512_R3_POLYVECBYTES, seed, S2N_KYBER_512_R3_SYMBYTES); +} + +/************************************************* +* Name: unpack_pk +* +* Description: De-serialize public key from a byte array; +* approximate inverse of pack_pk +* +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A +* - const uint8_t *packedpk: pointer to input serialized public key +**************************************************/ +static void unpack_pk(polyvec *pk, + uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], + const uint8_t packedpk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES]) +{ + polyvec_frombytes_avx2(pk, packedpk); + memcpy(seed, packedpk+S2N_KYBER_512_R3_POLYVECBYTES, S2N_KYBER_512_R3_SYMBYTES); +} + +/************************************************* +* Name: pack_sk +* +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by polyvec_reduce_avx2(). +* +* Arguments: - uint8_t *r: pointer to output serialized secret key +* - polyvec *sk: pointer to input vector of polynomials (secret key) +**************************************************/ +static void pack_sk(uint8_t r[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES], polyvec *sk) +{ + polyvec_tobytes_avx2(r, sk); +} + +/************************************************* +* Name: unpack_sk +* +* Description: De-serialize the secret key; inverse of pack_sk +* +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* - const uint8_t *packedsk: pointer to input serialized secret key +**************************************************/ +static void unpack_sk(polyvec *sk, const uint8_t packedsk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) +{ + polyvec_frombytes_avx2(sk, packedsk); +} + +/************************************************* +* Name: pack_ciphertext +* +* Description: Serialize the ciphertext as concatenation of the +* compressed and serialized vector of polynomials b +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by polyvec_reduce_avx2() and poly_reduce_avx2(), respectively. 
+* +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v +**************************************************/ +static void pack_ciphertext(uint8_t r[S2N_KYBER_512_R3_INDCPA_BYTES], polyvec *b, poly *v) +{ + polyvec_compress_avx2(r, b); + poly_compress_avx2(r+S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: unpack_ciphertext +* +* Description: De-serialize and decompress ciphertext from a byte array; +* approximate inverse of pack_ciphertext +* +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v +* - const uint8_t *c: pointer to the input serialized ciphertext +**************************************************/ +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES]) +{ + polyvec_decompress_avx2(b, c); + poly_decompress_avx2(v, c+S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES); +} + +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + + ctr = pos = 0; + while(ctr < len && pos <= buflen - 3) { // buflen is always at least 3 + val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF; + val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF; + pos += 3; + + if(val0 < S2N_KYBER_512_R3_Q) + r[ctr++] = val0; + if(ctr < len && val1 < S2N_KYBER_512_R3_Q) + r[ctr++] = val1; + } + + return ctr; +} + +#define gen_a(A,B) gen_matrix_avx2(A,B,0) +#define gen_at(A,B) gen_matrix_avx2(A,B,1) + +/************************************************* +* Name: gen_matrix_avx2 +* +* Description: Deterministically generate matrix A (or the transpose of A) +* from a seed. Entries of the matrix are polynomials that look +* uniformly random. 
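To make the 3-bytes-to-two-candidates unpacking in rej_uniform above concrete, one worked input group (purely illustrative):

    buf = { 0x12, 0x34, 0x56, ... }
    val0 = (0x12 | 0x34 << 8) & 0xFFF       = 0x412 = 1042   -> 1042 < 3329, accepted
    val1 = (0x34 >> 4 | 0x56 << 4) & 0xFFF  = 0x563 = 1379   -> 1379 < 3329, accepted

Candidates of 3329 or above are skipped without consuming output slots, which is why gen_matrix_avx2 below keeps squeezing further SHAKE-128 blocks until every counter reaches S2N_KYBER_512_R3_N.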
Performs rejection sampling on output of +* an XOF +* +* Arguments: - polyvec *a: pointer to output matrix A +* - const uint8_t *seed: pointer to input seed +* - int transposed: boolean deciding whether A or A^T is generated +**************************************************/ +void gen_matrix_avx2(polyvec *a, const uint8_t seed[32], int transposed) +{ + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS*S2N_KYBER_512_R3_SHAKE128_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)seed); + f = _mm256_loadu_si256((const void *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + if(transposed) { + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 0; + buf[1].coeffs[33] = 1; + buf[2].coeffs[32] = 1; + buf[2].coeffs[33] = 0; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 1; + } + else { + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 1; + buf[1].coeffs[33] = 0; + buf[2].coeffs[32] = 0; + buf[2].coeffs[33] = 1; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 1; + } + + shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); + shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS, &state); + + ctr0 = rej_uniform_avx2(a[0].vec[0].coeffs, buf[0].coeffs); + ctr1 = rej_uniform_avx2(a[0].vec[1].coeffs, buf[1].coeffs); + ctr2 = rej_uniform_avx2(a[1].vec[0].coeffs, buf[2].coeffs); + ctr3 = rej_uniform_avx2(a[1].vec[1].coeffs, buf[3].coeffs); + + while(ctr0 < S2N_KYBER_512_R3_N || ctr1 < S2N_KYBER_512_R3_N || ctr2 < S2N_KYBER_512_R3_N || ctr3 < S2N_KYBER_512_R3_N) { + shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); + + ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, S2N_KYBER_512_R3_N - ctr0, buf[0].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, S2N_KYBER_512_R3_N - ctr1, buf[1].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, S2N_KYBER_512_R3_N - ctr2, buf[2].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, S2N_KYBER_512_R3_N - ctr3, buf[3].coeffs, S2N_KYBER_512_R3_SHAKE128_RATE); + } + + poly_nttunpack_avx2(&a[0].vec[0]); + poly_nttunpack_avx2(&a[0].vec[1]); + poly_nttunpack_avx2(&a[1].vec[0]); + poly_nttunpack_avx2(&a[1].vec[1]); +} + +/************************************************** +* Name: indcpa_keypair_avx2 +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - uint8_t *pk: pointer to output public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES bytes) +**************************************************/ +int indcpa_keypair_avx2(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) +{ + unsigned int i; + uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + S2N_KYBER_512_R3_SYMBYTES; + polyvec a[S2N_KYBER_512_R3_K], e, pkpv, skpv; + + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES)); +
sha3_512(buf, buf, S2N_KYBER_512_R3_SYMBYTES); + + gen_a(a, publicseed); + + poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3); + + polyvec_ntt_avx2(&skpv); + polyvec_reduce_avx2(&skpv); + polyvec_ntt_avx2(&e); + + // matrix-vector multiplication + for(i=0;i<S2N_KYBER_512_R3_K;i++) { + polyvec_basemul_acc_montgomery_avx2(&pkpv.vec[i], &a[i], &skpv); + poly_tomont_avx2(&pkpv.vec[i]); + } + + polyvec_add_avx2(&pkpv, &pkpv, &e); + polyvec_reduce_avx2(&pkpv); + + pack_sk(sk, &skpv); + pack_pk(pk, &pkpv, publicseed); + + return 0; +} + +/************************************************* +* Name: indcpa_enc_avx2 +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. +* +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length S2N_KYBER_512_R3_SYMBYTES) to deterministically +* generate all randomness +**************************************************/ +void indcpa_enc_avx2(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]) +{ + unsigned int i; + uint8_t seed[S2N_KYBER_512_R3_SYMBYTES]; + polyvec sp, pkpv, ep, at[S2N_KYBER_512_R3_K], b; + poly v, k, epp; + + unpack_pk(&pkpv, seed, pk); + poly_frommsg_avx2(&k, m); + gen_at(at, seed); + + poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3); + poly_getnoise_eta2_avx2(&epp, coins, 4); + + polyvec_ntt_avx2(&sp); + + // matrix-vector multiplication + for(i=0;i<S2N_KYBER_512_R3_K;i++) + polyvec_basemul_acc_montgomery_avx2(&b.vec[i], &at[i], &sp); + polyvec_basemul_acc_montgomery_avx2(&v, &pkpv, &sp); + + polyvec_invntt_tomont_avx2(&b); + poly_invntt_tomont_avx2(&v); + + polyvec_add_avx2(&b, &b, &ep); + poly_add_avx2(&v, &v, &epp); + poly_add_avx2(&v, &v, &k); + polyvec_reduce_avx2(&b); + poly_reduce_avx2(&v); + + pack_ciphertext(c, &b, &v); +} + +/************************************************* +* Name: indcpa_dec_avx2 +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
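A reading note on the batched noise sampling used by indcpa_keypair_avx2 and indcpa_enc_avx2 above (the _4x helpers are defined elsewhere in this patch, so the mapping below is an inference from the call sites, not a quotation): for the AVX2 and reference paths to remain interchangeable at the KEM layer, the explicit nonce arguments must reproduce the reference nonce++ sequence, i.e.

    poly_getnoise_eta1_4x(skpv.vec+0, skpv.vec+1, e.vec+0, e.vec+1, noiseseed, 0, 1, 2, 3)
        plays the role of the reference loops that sample skpv with nonces 0..1 and e with nonces 2..3;
    poly_getnoise_eta1122_4x(sp.vec+0, sp.vec+1, ep.vec+0, ep.vec+1, coins, 0, 1, 2, 3)
        plays the role of eta1 sampling of sp with nonces 0..1 and eta2 sampling of ep with nonces 2..3,
        with epp sampled separately under nonce 4 in both paths.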
+* +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length S2N_KYBER_512_R3_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length S2N_KYBER_512_R3_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES) +**************************************************/ +void indcpa_dec_avx2(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]) +{ + polyvec b, skpv; + poly v, mp; + + unpack_ciphertext(&b, &v, c); + unpack_sk(&skpv, sk); + + polyvec_ntt_avx2(&b); + polyvec_basemul_acc_montgomery_avx2(&mp, &skpv, &b); + poly_invntt_tomont_avx2(&mp); + + poly_sub_avx2(&mp, &v, &mp); + poly_reduce_avx2(&mp); + + poly_tomsg_avx2(m, &mp); +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h new file mode 100644 index 0000000000..127e5bc4f6 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_indcpa_avx2.h @@ -0,0 +1,25 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_polyvec_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define gen_matrix_avx2 S2N_KYBER_512_R3_NAMESPACE(gen_matrix_avx2) +void gen_matrix_avx2(polyvec *a, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], int transposed); + +#define indcpa_keypair_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_keypair_avx2) +int indcpa_keypair_avx2(uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); + +#define indcpa_enc_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_enc_avx2) +void indcpa_enc_avx2(uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t pk[S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[S2N_KYBER_512_R3_SYMBYTES]); + +#define indcpa_dec_avx2 S2N_KYBER_512_R3_NAMESPACE(indcpa_dec_avx2) +void indcpa_dec_avx2(uint8_t m[S2N_KYBER_512_R3_INDCPA_MSGBYTES], + const uint8_t c[S2N_KYBER_512_R3_INDCPA_BYTES], + const uint8_t sk[S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES]); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S new file mode 100644 index 0000000000..8f131668ff --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_invntt_avx2.S @@ -0,0 +1,255 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "shuffle.inc"*****/ +/********************************/ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm +/********************************/ + +/*****.include "fq.inc"*****/ +/***************************/ +.macro red16 r,rs=0,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs 
+vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else +vpsraw $10,%ymm\x,%ymm\x +.endif +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm +/***************************/ + +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + +vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 + +vpmullw %ymm\zl0,%ymm13,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 + +vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 +vpmullw %ymm\zl1,%ymm15,%ymm\rh3 + +vpmulhw %ymm\zh0,%ymm12,%ymm12 +vpmulhw %ymm\zh0,%ymm13,%ymm13 + +vpmulhw %ymm\zh1,%ymm14,%ymm14 +vpmulhw %ymm\zh1,%ymm15,%ymm15 + +vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + +vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + +vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + +vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + +vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + +vpsubw %ymm\rh2,%ymm14,%ymm\rh2 +vpsubw %ymm\rh3,%ymm15,%ymm\rh3 +.endm + +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 + +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 + +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 +vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 + +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 + +vmovdqa _16XV*2(%rsi),%ymm1 +red16 3 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 + +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +/* level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 + 
+butterfly 9,10,6,4,3,5,8,11,2,2,7,7 + +red16 9 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +/* level 5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 + +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 + +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm + +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 + +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 + +.if \off == 0 +red16 4 +.endif + +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.text +.global cdecl(invntt_avx2_asm) +cdecl(invntt_avx2_asm): +vmovdqa _16XQ*2(%rsi),%ymm0 + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c new file mode 100644 index 0000000000..9d6c49b9c4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_kem.c @@ -0,0 +1,158 @@ +#include <stddef.h> +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_symmetric.h" +#include "kyber512r3_indcpa.h" +#include "kyber512r3_indcpa_avx2.h" +#include "tls/s2n_kem.h" +#include "utils/s2n_safety.h" +#include "pq-crypto/s2n_pq_random.h" +#include "pq-crypto/s2n_pq.h" + +/************************************************* +* Name: crypto_kem_keypair +* +* Description: Generates public and private key +* for CCA-secure Kyber key encapsulation mechanism +* +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of S2N_KYBER_512_R3_PUBLIC_KEY_BYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of S2N_KYBER_512_R3_SECRET_KEY_BYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int s2n_kyber_512_r3_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + POSIX_GUARD(indcpa_keypair_avx2(pk, sk)); + }else +#endif + { + POSIX_GUARD(indcpa_keypair(pk, sk)); + } + + for(size_t i = 0; i < S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES] = pk[i]; + } + sha3_256(sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-2*S2N_KYBER_512_R3_SYMBYTES, pk, S2N_KYBER_512_R3_PUBLIC_KEY_BYTES); + /* Value z for pseudo-random output on reject */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-S2N_KYBER_512_R3_SYMBYTES, S2N_KYBER_512_R3_SYMBYTES)); + return S2N_SUCCESS; +} + +/************************************************* +* 
Name: crypto_kem_enc +* +* Description: Generates cipher text and shared +* secret for given public key +* +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of S2N_KYBER_512_R3_CIPHERTEXT_BYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of S2N_KYBER_512_R3_SHARED_SECRET_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of S2N_KYBER_512_R3_PUBLIC_KEY_BYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int s2n_kyber_512_r3_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2*S2N_KYBER_512_R3_SYMBYTES]; + + POSIX_GUARD_RESULT(s2n_get_random_bytes(buf, S2N_KYBER_512_R3_SYMBYTES)); + /* Don't release system RNG output */ + sha3_256(buf, buf, S2N_KYBER_512_R3_SYMBYTES); + + /* Multitarget countermeasure for coins + contributory KEM */ + sha3_256(buf+S2N_KYBER_512_R3_SYMBYTES, pk, S2N_KYBER_512_R3_PUBLIC_KEY_BYTES); + sha3_512(kr, buf, 2*S2N_KYBER_512_R3_SYMBYTES); + + /* coins are in kr+S2N_KYBER_512_R3_SYMBYTES */ +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + indcpa_enc_avx2(ct, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + }else +#endif + { + indcpa_enc(ct, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + } + + /* overwrite coins in kr with H(c) */ + sha3_256(kr+S2N_KYBER_512_R3_SYMBYTES, ct, S2N_KYBER_512_R3_CIPHERTEXT_BYTES); + /* hash concatenation of pre-k and H(c) to k */ + shake256(ss, S2N_KYBER_512_R3_SSBYTES, kr, 2*S2N_KYBER_512_R3_SYMBYTES); + return S2N_SUCCESS; +} + +/************************************************* +* Name: crypto_kem_dec +* +* Description: Generates shared secret for given +* cipher text and private key +* +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of S2N_KYBER_512_R3_SHARED_SECRET_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of S2N_KYBER_512_R3_CIPHERTEXT_BYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of S2N_KYBER_512_R3_SECRET_KEY_BYTES bytes) +* +* Returns 0. +* +* On failure, ss will contain a pseudo-random value. 
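Before the decapsulation body that follows, a minimal sketch of how the three s2n_kyber_512_r3_crypto_kem_* entry points fit together; the harness name is hypothetical, and the size macros are assumed to be visible through the same headers that kyber512r3_kem.c itself includes:

    #include <string.h>
    #include "kyber512r3_params.h"
    #include "tls/s2n_kem.h"

    /* Hypothetical encapsulate/decapsulate round trip: one side keeps sk and
     * publishes pk; the other encapsulates against pk and returns ct. */
    static int kem_roundtrip_example(void)
    {
        unsigned char pk[S2N_KYBER_512_R3_PUBLIC_KEY_BYTES];
        unsigned char sk[S2N_KYBER_512_R3_SECRET_KEY_BYTES];
        unsigned char ct[S2N_KYBER_512_R3_CIPHERTEXT_BYTES];
        unsigned char ss_enc[S2N_KYBER_512_R3_SHARED_SECRET_BYTES];
        unsigned char ss_dec[S2N_KYBER_512_R3_SHARED_SECRET_BYTES];

        if (s2n_kyber_512_r3_crypto_kem_keypair(pk, sk) != 0) { return -1; }
        if (s2n_kyber_512_r3_crypto_kem_enc(ct, ss_enc, pk) != 0) { return -1; }
        if (s2n_kyber_512_r3_crypto_kem_dec(ss_dec, ct, sk) != 0) { return -1; }

        /* Both sides now hold the same 32-byte shared secret. */
        return memcmp(ss_enc, ss_dec, sizeof(ss_enc)) == 0 ? 0 : -1;
    }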
+**************************************************/ +int s2n_kyber_512_r3_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + uint8_t buf[2*S2N_KYBER_512_R3_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2*S2N_KYBER_512_R3_SYMBYTES]; + uint8_t cmp[S2N_KYBER_512_R3_CIPHERTEXT_BYTES]; + const uint8_t *pk = sk+S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES; + +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + indcpa_dec_avx2(buf, ct, sk); + }else +#endif + { + indcpa_dec(buf, ct, sk); + } + + /* Multitarget countermeasure for coins + contributory KEM */ + for(size_t i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + buf[S2N_KYBER_512_R3_SYMBYTES + i] = sk[S2N_KYBER_512_R3_SECRET_KEY_BYTES - 2 * S2N_KYBER_512_R3_SYMBYTES + i]; + } + sha3_512(kr, buf, 2*S2N_KYBER_512_R3_SYMBYTES); + + /* coins are in kr+S2N_KYBER_512_R3_SYMBYTES */ +#if defined(S2N_KYBER512R3_AVX2_BMI2) + if (s2n_kyber512r3_is_avx2_bmi2_enabled()) { + indcpa_enc_avx2(cmp, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + }else +#endif + { + indcpa_enc(cmp, buf, pk, kr+S2N_KYBER_512_R3_SYMBYTES); + } + + /* If ct and cmp are equal (dont_copy = 1), decryption has succeeded and we do NOT overwrite pre-k below. + * If ct and cmp are not equal (dont_copy = 0), decryption fails and we do overwrite pre-k. */ + int dont_copy = s2n_constant_time_equals(ct, cmp, S2N_KYBER_512_R3_CIPHERTEXT_BYTES); + + /* overwrite coins in kr with H(c) */ + sha3_256(kr+S2N_KYBER_512_R3_SYMBYTES, ct, S2N_KYBER_512_R3_CIPHERTEXT_BYTES); + + /* Overwrite pre-k with z on re-encryption failure */ + POSIX_GUARD(s2n_constant_time_copy_or_dont(kr, sk+S2N_KYBER_512_R3_SECRET_KEY_BYTES-S2N_KYBER_512_R3_SYMBYTES, + S2N_KYBER_512_R3_SYMBYTES, dont_copy)); + + /* hash concatenation of pre-k and H(c) to k */ + shake256(ss, S2N_KYBER_512_R3_SSBYTES, kr, 2*S2N_KYBER_512_R3_SYMBYTES); + return S2N_SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c new file mode 100644 index 0000000000..6c82105c19 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.c @@ -0,0 +1,122 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_ntt.h" +#include "kyber512r3_reduce.h" + +const int16_t zetas[128] = { + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 +}; + +const int16_t zetas_inv[128] = { + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 
1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 +}; + +/************************************************* +* Name: fqmul +* +* Description: Multiplication followed by Montgomery reduction +* +* Arguments: - int16_t a: first factor +* - int16_t b: second factor +* +* Returns 16-bit integer congruent to a*b*R^{-1} mod q +**************************************************/ +static int16_t fqmul(int16_t a, int16_t b) { + return montgomery_reduce((int32_t)a * b); +} + +/************************************************* +* Name: ntt +* +* Description: Inplace number-theoretic transform (NTT) in Rq +* input is in standard order, output is in bitreversed order +* +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq +**************************************************/ +void ntt(int16_t r[256]) { + unsigned int len, start, j, k; + int16_t t, zeta; + + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { + zeta = zetas[k++]; + for (j = start; j < start + len; ++j) { + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; + } + } + } +} + +/************************************************* +* Name: invntt_tomont +* +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. +* Input is in bitreversed order, output is in standard order +* +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq +**************************************************/ +void invntt(int16_t r[256]) { + unsigned int start, len, j, k; + int16_t t, zeta; + + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { + zeta = zetas_inv[k++]; + for (j = start; j < start + len; ++j) { + t = r[j]; + r[j] = barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); + } + } + } + + for (j = 0; j < 256; ++j) { + r[j] = fqmul(r[j], zetas_inv[127]); + } +} + +/************************************************* +* Name: basemul +* +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) +* used for multiplication of elements in Rq in NTT domain +* +* Arguments: - int16_t r[2]: pointer to the output polynomial +* - const int16_t a[2]: pointer to the first factor +* - const int16_t b[2]: pointer to the second factor +* - int16_t zeta: integer defining the reduction polynomial +**************************************************/ +void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { + r[0] = fqmul(a[1], b[1]); + r[0] = fqmul(r[0], zeta); + r[0] += fqmul(a[0], b[0]); + + r[1] = fqmul(a[0], b[1]); + r[1] += fqmul(a[1], b[0]); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h new file mode 100644 index 0000000000..98d6235764 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt.h @@ -0,0 +1,19 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +#define zetas S2N_KYBER_512_R3_NAMESPACE(zetas) +extern const int16_t zetas[128]; + +#define zetas_inv 
S2N_KYBER_512_R3_NAMESPACE(zetas_inv) +extern const int16_t zetas_inv[128]; + +#define ntt S2N_KYBER_512_R3_NAMESPACE(ntt) +void ntt(int16_t poly[256]); + +#define invntt S2N_KYBER_512_R3_NAMESPACE(invntt) +void invntt(int16_t poly[256]); + +#define basemul S2N_KYBER_512_R3_NAMESPACE(basemul) +void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S new file mode 100644 index 0000000000..dc80086cb1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.S @@ -0,0 +1,218 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "shuffle.inc"*****/ +/********************************/ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm +/********************************/ + +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 +.endm + +.macro reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 + +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 +.endm + +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 + +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln +vpaddw %ymm12,%ymm\rh0,%ymm\rh0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + +vpaddw %ymm13,%ymm\rh1,%ymm\rh1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 +vpaddw %ymm14,%ymm\rh2,%ymm\rh2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 +vpaddw %ymm15,%ymm\rh3,%ymm\rh3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa 
(128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 +vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm + +.text +.global cdecl(ntt_avx2_asm) +cdecl(ntt_avx2_asm): +vmovdqa _16XQ*2(%rsi),%ymm0 + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h new file mode 100644 index 0000000000..3616132358 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_ntt_avx2.h @@ -0,0 +1,28 @@ +#pragma once + +#include <stdint.h> + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define ntt_avx2_asm S2N_KYBER_512_R3_NAMESPACE(ntt_avx2_asm) +void ntt_avx2_asm(__m256i *r, const __m256i *qdata); + +#define invntt_avx2_asm S2N_KYBER_512_R3_NAMESPACE(invntt_avx2_asm) +void invntt_avx2_asm(__m256i *r, const __m256i *qdata); + +#define nttunpack_avx2_asm S2N_KYBER_512_R3_NAMESPACE(nttunpack_avx2_asm) +void nttunpack_avx2_asm(__m256i *r, const __m256i *qdata); + +#define basemul_avx2_asm S2N_KYBER_512_R3_NAMESPACE(basemul_avx2_asm) +void basemul_avx2_asm(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *qdata); + +#define ntttobytes_avx2_asm S2N_KYBER_512_R3_NAMESPACE(ntttobytes_avx2_asm) +void ntttobytes_avx2_asm(uint8_t *r, const __m256i *a, const __m256i *qdata); + +#define nttfrombytes_avx2_asm S2N_KYBER_512_R3_NAMESPACE(nttfrombytes_avx2_asm) +void nttfrombytes_avx2_asm(__m256i *r, const uint8_t *a, const __m256i *qdata); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h new file 
mode 100644 index 0000000000..d2d32d08f1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_params.h @@ -0,0 +1,31 @@ +#pragma once + +/* All kyber512r3 functions and global variables in the pq-crypto/kyber_r3 directory + * should be defined using the namespace macro to avoid symbol collisions. For example, + * in foo.h, declare a function as follows: + * + * #define foo_function S2N_KYBER_512_R3_NAMESPACE(foo_function) + * int foo_function(int foo_argument); */ +#define S2N_KYBER_512_R3_NAMESPACE(s) s2n_kyber_512_r3_##s + +#define S2N_KYBER_512_R3_K 2 + +#define S2N_KYBER_512_R3_N 256 +#define S2N_KYBER_512_R3_Q 3329 + +#define S2N_KYBER_512_R3_SYMBYTES 32 /* size in bytes of hashes, and seeds */ +#define S2N_KYBER_512_R3_SSBYTES 32 /* size in bytes of shared key */ + +#define S2N_KYBER_512_R3_POLYBYTES 384 +#define S2N_KYBER_512_R3_POLYVECBYTES (S2N_KYBER_512_R3_K * S2N_KYBER_512_R3_POLYBYTES) + +#define S2N_KYBER_512_R3_ETA1 3 +#define S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES 128 +#define S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES 640 + +#define S2N_KYBER_512_R3_ETA2 2 + +#define S2N_KYBER_512_R3_INDCPA_MSGBYTES S2N_KYBER_512_R3_SYMBYTES +#define S2N_KYBER_512_R3_INDCPA_PUBLICKEYBYTES (S2N_KYBER_512_R3_POLYVECBYTES + S2N_KYBER_512_R3_SYMBYTES) +#define S2N_KYBER_512_R3_INDCPA_SECRETKEYBYTES (S2N_KYBER_512_R3_POLYVECBYTES) +#define S2N_KYBER_512_R3_INDCPA_BYTES (S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES + S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES) diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c new file mode 100644 index 0000000000..76ae60a583 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.c @@ -0,0 +1,300 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" +#include "kyber512r3_ntt.h" +#include "kyber512r3_reduce.h" +#include "kyber512r3_cbd.h" +#include "kyber512r3_symmetric.h" + +/************************************************* +* Name: poly_compress +* +* Description: Compression and subsequent serialization of a polynomial +* +* Arguments: - uint8_t *r: pointer to output byte array +* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_compress(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i, j; + uint8_t t[8]; + + poly_csubq(a); + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q) & 15; + } + + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } +} + +/************************************************* +* Name: poly_decompress +* +* Description: De-serialization and subsequent decompression of a polynomial; +* approximate inverse of poly_compress +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES bytes) +**************************************************/ +void poly_decompress(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]) { + unsigned int i; + + for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * S2N_KYBER_512_R3_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * S2N_KYBER_512_R3_Q) + 8) >> 4; + a += 
1; + } +} + +/************************************************* +* Name: poly_tobytes +* +* Description: Serialization of a polynomial +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYBYTES bytes) +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], poly *a) { + unsigned int i; + + poly_csubq(a); + + for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) { + uint16_t t0 = a->coeffs[2 * i]; + uint16_t t1 = a->coeffs[2 * i + 1]; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); + } +} + +/************************************************* +* Name: poly_frombytes +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of S2N_KYBER_512_R3_POLYBYTES bytes) +**************************************************/ +void poly_frombytes(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; + } +} + +/************************************************* +* Name: poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void poly_frommsg(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]) { + unsigned int i, j; + int16_t mask; + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((S2N_KYBER_512_R3_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], poly *a) { + unsigned int i, j; + uint16_t t; + + poly_csubq(a); + + for (i = 0; i < S2N_KYBER_512_R3_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q) & 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: poly_getnoise_eta1 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA1 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta1(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) { + uint8_t buf[S2N_KYBER_512_R3_ETA1 * S2N_KYBER_512_R3_N / 4]; + shake256_prf(buf, sizeof(buf), seed, nonce); + cbd_eta1(r, buf); +} + +/************************************************* +* Name: poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to 
centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) { + uint8_t buf[S2N_KYBER_512_R3_ETA2 * S2N_KYBER_512_R3_N / 4]; + shake256_prf(buf, sizeof(buf), seed, nonce); + cbd_eta2(r, buf); +} + + +/************************************************* +* Name: poly_ntt +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in normal order, output in bitreversed order +* +* Arguments: - uint16_t *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt(poly *r) { + ntt(r->coeffs); + poly_reduce(r); +} + +/************************************************* +* Name: poly_invntt_tomont +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; +* inputs assumed to be in bitreversed order, output in normal order +* +* Arguments: - uint16_t *a: pointer to in/output polynomial +**************************************************/ +void poly_invntt_tomont(poly *r) { + invntt(r->coeffs); +} + +/************************************************* +* Name: poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N / 4; i++) { + basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], zetas[64 + i]); + basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -zetas[64 + i]); + } +} + +/************************************************* +* Name: poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_tomont(poly *r) { + unsigned int i; + const int16_t f = (1ULL << 32) % S2N_KYBER_512_R3_Q; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i] * f); + } +} + +/************************************************* +* Name: poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce(poly *r) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = barrett_reduce(r->coeffs[i]); + } +} + +/************************************************* +* Name: poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
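The constant f in poly_tomont above is worth a quick sanity check; the congruences below are illustrative reasoning only, using the R = 2^16 Montgomery convention implied by the fqmul and invntt_tomont comments earlier in this patch:

    f = (1ULL << 32) % q = 2^32 mod 3329 = 1353
    montgomery_reduce(c * f) ≡ c * f * 2^-16 ≡ c * 2^32 * 2^-16 ≡ c * 2^16   (mod q)

so every coefficient leaves poly_tomont carrying the Montgomery factor R = 2^16, which is the 'Montgomery domain' that the function comment refers to.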
For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_csubq(poly *r) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = csubq(r->coeffs[i]); + } +} + +/************************************************* +* Name: poly_add +* +* Description: Add two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add(poly *r, const poly *a, const poly *b) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; + } +} + +/************************************************* +* Name: poly_sub +* +* Description: Subtract two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub(poly *r, const poly *a, const poly *b) { + unsigned int i; + for (i = 0; i < S2N_KYBER_512_R3_N; i++) { + r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h new file mode 100644 index 0000000000..da43766e51 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly.h @@ -0,0 +1,61 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +/* + * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] + */ +#define poly S2N_KYBER_512_R3_NAMESPACE(poly) +typedef struct { + int16_t coeffs[S2N_KYBER_512_R3_N]; +} poly; + +#define poly_compress S2N_KYBER_512_R3_NAMESPACE(poly_compress) +void poly_compress(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], poly *a); + +#define poly_decompress S2N_KYBER_512_R3_NAMESPACE(poly_decompress) +void poly_decompress(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]); + +#define poly_tobytes S2N_KYBER_512_R3_NAMESPACE(poly_tobytes) +void poly_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], poly *a); + +#define poly_frombytes S2N_KYBER_512_R3_NAMESPACE(poly_frombytes) +void poly_frombytes(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]); + +#define poly_frommsg S2N_KYBER_512_R3_NAMESPACE(poly_frommsg) +void poly_frommsg(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]); + +#define poly_tomsg S2N_KYBER_512_R3_NAMESPACE(poly_tomsg) +void poly_tomsg(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], poly *r); + +#define poly_getnoise_eta1 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1) +void poly_getnoise_eta1(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_getnoise_eta2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2) +void poly_getnoise_eta2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_ntt S2N_KYBER_512_R3_NAMESPACE(poly_ntt) +void poly_ntt(poly *r); + +#define poly_invntt_tomont S2N_KYBER_512_R3_NAMESPACE(poly_invntt_tomont) +void poly_invntt_tomont(poly *r); + +#define poly_basemul_montgomery S2N_KYBER_512_R3_NAMESPACE(poly_basemul_montgomery) +void poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +#define poly_tomont S2N_KYBER_512_R3_NAMESPACE(poly_tomont) +void poly_tomont(poly *r); + +#define poly_reduce S2N_KYBER_512_R3_NAMESPACE(poly_reduce) +void poly_reduce(poly *r); + +#define poly_csubq S2N_KYBER_512_R3_NAMESPACE(poly_csubq) +void poly_csubq(poly *r); + +#define poly_add S2N_KYBER_512_R3_NAMESPACE(poly_add) +void poly_add(poly *r, const poly *a, const poly *b); + +#define poly_sub S2N_KYBER_512_R3_NAMESPACE(poly_sub) +void poly_sub(poly *r, const poly *a, const poly *b); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c new file mode 100644 index 0000000000..aa961ff403 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.c @@ -0,0 +1,453 @@ +#include <stdint.h> +#include <string.h> +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_consts_avx2.h" +#include "kyber512r3_poly_avx2.h" +#include "kyber512r3_ntt_avx2.h" +#include "kyber512r3_reduce_avx2.h" +#include "kyber512r3_cbd_avx2.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_fips202x4_avx2.h" +#include "kyber512r3_symmetric.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +/************************************************* +* Name: poly_compress_avx2 +* +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by poly_reduce_avx2(). 
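Both the reference poly_compress earlier in this patch and this AVX2 version implement the same 4-bit coefficient compression, x -> round(16*x/q) mod 16, inverted by poly_decompress as y -> round(q*y/16). One worked coefficient, shown only to illustrate the intentional lossiness:

    compress(1000)  = ((1000 << 4) + 3329/2) / 3329 & 15 = (16000 + 1664) / 3329 & 15 = 5
    decompress(5)   = (5 * 3329 + 8) >> 4 = 16653 >> 4 = 1040

so a coefficient of 1000 comes back as 1040; the error of 40 stays within the roughly q/32 ≈ 104 rounding error that this compression can introduce and that the decryption noise budget is sized to absorb.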
+* +* Arguments: - uint8_t *r: pointer to output byte array +* (of length S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_compress_avx2(uint8_t r[128], const poly * restrict a) +{ + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 9); + const __m256i mask = _mm256_set1_epi16(15); + const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1); + const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0); + + for(i=0;i<S2N_KYBER_512_R3_N/64;i++) { + f0 = _mm256_load_si256(&a->vec[4*i+0]); + f1 = _mm256_load_si256(&a->vec[4*i+1]); + f2 = _mm256_load_si256(&a->vec[4*i+2]); + f3 = _mm256_load_si256(&a->vec[4*i+3]); + f0 = _mm256_mulhi_epi16(f0,v); + f1 = _mm256_mulhi_epi16(f1,v); + f2 = _mm256_mulhi_epi16(f2,v); + f3 = _mm256_mulhi_epi16(f3,v); + f0 = _mm256_mulhrs_epi16(f0,shift1); + f1 = _mm256_mulhrs_epi16(f1,shift1); + f2 = _mm256_mulhrs_epi16(f2,shift1); + f3 = _mm256_mulhrs_epi16(f3,shift1); + f0 = _mm256_and_si256(f0,mask); + f1 = _mm256_and_si256(f1,mask); + f2 = _mm256_and_si256(f2,mask); + f3 = _mm256_and_si256(f3,mask); + f0 = _mm256_packus_epi16(f0,f1); + f2 = _mm256_packus_epi16(f2,f3); + f0 = _mm256_maddubs_epi16(f0,shift2); + f2 = _mm256_maddubs_epi16(f2,shift2); + f0 = _mm256_packus_epi16(f0,f2); + f0 = _mm256_permutevar8x32_epi32(f0,permdidx); + // correcting cast-align error + // old version: _mm256_storeu_si256((__m256i *)&r[32*i],f0); + _mm256_storeu_si256((void *)&r[32*i],f0); + } +} + +void poly_decompress_avx2(poly * restrict r, const uint8_t a[128]) +{ + unsigned int i; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]); + const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4, + 3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0); + const __m256i mask = _mm256_set1_epi32(0x00F0000F); + const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + // correcting cast-align and cast-qual errors + // old version: t = _mm_loadl_epi64((__m128i *)&a[8*i]); + t = _mm_loadl_epi64((const void *)&a[8*i]); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f,shufbidx); + f = _mm256_and_si256(f,mask); + f = _mm256_mullo_epi16(f,shift); + f = _mm256_mulhrs_epi16(f,q); + _mm256_store_si256(&r->vec[i],f); + } +} + +/************************************************* +* Name: poly_tobytes_avx2 +* +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by poly_reduce_avx2(). The coefficients are orderd as output by +* poly_ntt_avx2(); the serialized output coefficients are in bitreversed +* order. 
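* Each coefficient occupies 12 bits in the serialized form, so the whole
* polynomial takes 256*12/8 = 384 bytes, which is the value of
* S2N_KYBER_512_R3_POLYBYTES.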
+* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYBYTES bytes) +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], const poly *a) +{ + ntttobytes_avx2_asm(r, a->vec, qdata.vec); +} + +/************************************************* +* Name: poly_frombytes_avx2 +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes_avx2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of S2N_KYBER_512_R3_POLYBYTES bytes) +**************************************************/ +void poly_frombytes_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]) +{ + nttfrombytes_avx2_asm(r->vec, a, qdata.vec); +} + +/************************************************* +* Name: poly_frommsg_avx2 +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void poly_frommsg_avx2(poly * restrict r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]) +{ + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3)); + const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0)); + const __m256i hqs = _mm256_set1_epi16((S2N_KYBER_512_R3_Q+1)/2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*i); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256(&r->vec[0+2*i+0],g0); \ + _mm256_store_si256(&r->vec[0+2*i+1],g1); \ + _mm256_store_si256(&r->vec[8+2*i+0],g2); \ + _mm256_store_si256(&r->vec[8+2*i+1],g3) + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)msg); + f = _mm256_loadu_si256((const void *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); +} + +/************************************************* +* Name: poly_tomsg_avx2 +* +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by poly_reduce_avx2(). 
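* Each coefficient contributes one bit of the 32-byte message: the bit is 1
* when the coefficient is closer to q/2 than to 0 or q, and 0 otherwise.
* For example, with q = 3329 a coefficient of 900 yields bit 1 and a
* coefficient of 300 yields bit 0.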
+* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg_avx2(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const poly * restrict a) +{ + unsigned int i; + uint32_t small; + __m256i f0, f1, g0, g1; + const __m256i hq = _mm256_set1_epi16((S2N_KYBER_512_R3_Q - 1)/2); + const __m256i hhq = _mm256_set1_epi16((S2N_KYBER_512_R3_Q - 1)/4); + + for(i=0;i<S2N_KYBER_512_R3_N/32;i++) { + f0 = _mm256_load_si256(&a->vec[2*i+0]); + f1 = _mm256_load_si256(&a->vec[2*i+1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); + f0 = _mm256_packs_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + small = _mm256_movemask_epi8(f0); + memcpy(&msg[4*i], &small, 4); + } +} + +/************************************************* +* Name: poly_getnoise_eta1_avx2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA1 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta1_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) +{ + ALIGNED_UINT8(S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1_avx2 + shake256_prf(buf.coeffs, S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4, seed, nonce); + poly_cbd_eta1_avx2(r, buf.vec); +} + +/************************************************* +* Name: poly_getnoise_eta2_avx2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter S2N_KYBER_512_R3_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length S2N_KYBER_512_R3_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void poly_getnoise_eta2_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce) +{ + ALIGNED_UINT8(S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/4) buf; + shake256_prf(buf.coeffs, S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/4, seed, nonce); + poly_cbd_eta2_avx2(r, buf.vec); +} + +#define NOISE_NBLOCKS ((S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/4+S2N_KYBER_512_R3_SHAKE256_RATE-1)/S2N_KYBER_512_R3_SHAKE256_RATE) +void poly_getnoise_eta1_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3) +{ + ALIGNED_UINT8(NOISE_NBLOCKS*S2N_KYBER_512_R3_SHAKE256_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)seed); + f = _mm256_loadu_si256((const void *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; + + shake256x4_absorb_once(&state, 
buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); + + poly_cbd_eta1_avx2(r0, buf[0].vec); + poly_cbd_eta1_avx2(r1, buf[1].vec); + poly_cbd_eta1_avx2(r2, buf[2].vec); + poly_cbd_eta1_avx2(r3, buf[3].vec); +} + +void poly_getnoise_eta1122_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3) +{ + ALIGNED_UINT8(NOISE_NBLOCKS*S2N_KYBER_512_R3_SHAKE256_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)seed); + f = _mm256_loadu_si256((const void *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; + + shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); + + poly_cbd_eta1_avx2(r0, buf[0].vec); + poly_cbd_eta1_avx2(r1, buf[1].vec); + poly_cbd_eta2_avx2(r2, buf[2].vec); + poly_cbd_eta2_avx2(r3, buf[3].vec); +} + +/************************************************* +* Name: poly_ntt_avx2 +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place. +* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. +* +* Arguments: - poly *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt_avx2(poly *r) +{ + ntt_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_invntt_tomont_avx2 +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. +* +* Arguments: - poly *a: pointer to in/output polynomial +**************************************************/ +void poly_invntt_tomont_avx2(poly *r) +{ + invntt_avx2_asm(r->vec, qdata.vec); +} + +void poly_nttunpack_avx2(poly *r) +{ + nttunpack_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_basemul_montgomery_avx2 +* +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
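* In Kyber's incomplete NTT the multiplication is done on pairs of
* coefficients: (a[2i], a[2i+1]) and (b[2i], b[2i+1]) are multiplied as
* degree-one polynomials modulo X^2 - zeta_i, and each 16x16-bit product is
* Montgomery-reduced, which is where the implicit factor 2^-16 comes from.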
+* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b) +{ + basemul_avx2_asm(r->vec, a->vec, b->vec, qdata.vec); +} + +/************************************************* +* Name: poly_tomont_avx2 +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_tomont_avx2(poly *r) +{ + tomont_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_reduce_avx2 +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce_avx2(poly *r) +{ + reduce_avx2_asm(r->vec, qdata.vec); +} + +/************************************************* +* Name: poly_add_avx2 +* +* Description: Add two polynomials. No modular reduction +* is performed. +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add_avx2(poly *r, const poly *a, const poly *b) +{ + unsigned int i; + __m256i f0, f1; + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256(&r->vec[i], f0); + } +} + +/************************************************* +* Name: poly_sub_avx2 +* +* Description: Subtract two polynomials. No modular reduction +* is performed. 
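* Because no reduction is performed here (as in poly_add_avx2), repeated
* additions/subtractions can overflow int16_t unless the inputs are kept
* small; callers typically follow such sequences with poly_reduce_avx2.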
+* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub_avx2(poly *r, const poly *a, const poly *b) +{ + unsigned int i; + __m256i f0, f1; + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256(&r->vec[i], f0); + } +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h new file mode 100644 index 0000000000..bd6e857f79 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_poly_avx2.h @@ -0,0 +1,80 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_align_avx2.h" +#include "kyber512r3_params.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define poly S2N_KYBER_512_R3_NAMESPACE(poly) +typedef ALIGNED_INT16(S2N_KYBER_512_R3_N) poly; + +#define poly_compress_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_compress_avx2) +void poly_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES], const poly *a); + +#define poly_decompress_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_decompress_avx2) +void poly_decompress_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYCOMPRESSEDBYTES]); + +#define poly_tobytes_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tobytes_avx2) +void poly_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYBYTES], const poly *a); + +#define poly_frombytes_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_frombytes_avx2) +void poly_frombytes_avx2(poly *r, const uint8_t a[S2N_KYBER_512_R3_POLYBYTES]); + +#define poly_frommsg_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_frommsg_avx2) +void poly_frommsg_avx2(poly *r, const uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES]); + +#define poly_tomsg_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tomsg_avx2) +void poly_tomsg_avx2(uint8_t msg[S2N_KYBER_512_R3_INDCPA_MSGBYTES], const poly *r); + +#define poly_getnoise_eta1_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1_avx2) +void poly_getnoise_eta1_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_getnoise_eta2_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2_avx2) +void poly_getnoise_eta2_avx2(poly *r, const uint8_t seed[S2N_KYBER_512_R3_SYMBYTES], uint8_t nonce); + +#define poly_getnoise_eta1_4x S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta2_4x) +void poly_getnoise_eta1_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3); + +#define poly_getnoise_eta1122_4x S2N_KYBER_512_R3_NAMESPACE(poly_getnoise_eta1122_4x) +void poly_getnoise_eta1122_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3); + +#define poly_ntt_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_ntt_avx2) +void poly_ntt_avx2(poly *r); + +#define poly_invntt_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_invntt_tomont_avx2) +void poly_invntt_tomont_avx2(poly *r); + +#define poly_nttunpack_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_nttunpack_avx2) +void poly_nttunpack_avx2(poly *r); + +#define poly_basemul_montgomery_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_basemul_montgomery_avx2) +void poly_basemul_montgomery_avx2(poly *r, const poly *a, const poly *b); + +#define poly_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_tomont_avx2) +void 
poly_tomont_avx2(poly *r); + +#define poly_reduce_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_reduce_avx2) +void poly_reduce_avx2(poly *r); + +#define poly_add_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_add_avx2) +void poly_add_avx2(poly *r, const poly *a, const poly *b); + +#define poly_sub_avx2 S2N_KYBER_512_R3_NAMESPACE(poly_sub_avx2) +void poly_sub_avx2(poly *r, const poly *a, const poly *b); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c new file mode 100644 index 0000000000..0a84cd092a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.c @@ -0,0 +1,186 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" +#include "kyber512r3_polyvec.h" + +/************************************************* +* Name: polyvec_compress +* +* Description: Compress and serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_compress(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES], polyvec *a) { + polyvec_csubq(a); + + uint16_t t[4]; + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + for (unsigned int j = 0; j < S2N_KYBER_512_R3_N / 4; j++) { + for (unsigned int k = 0; k < 4; k++) + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + S2N_KYBER_512_R3_Q / 2) + / S2N_KYBER_512_R3_Q) & 0x3ff; + + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; + } + } +} + +/************************************************* +* Name: polyvec_decompress +* +* Description: De-serialize and decompress vector of polynomials; +* approximate inverse of polyvec_compress +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +**************************************************/ +void polyvec_decompress(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES]) { + uint16_t t[4]; + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + for (unsigned int j = 0; j < S2N_KYBER_512_R3_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (unsigned int k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * S2N_KYBER_512_R3_Q + 512) >> 10; + } + } + } +} + +/************************************************* +* Name: polyvec_tobytes +* +* Description: Serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], polyvec *a) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_tobytes(r + i * S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]); + } +} + +/************************************************* +* Name: polyvec_frombytes +* +* Description: De-serialize vector of polynomials; +* inverse of polyvec_tobytes +* +* Arguments: - uint8_t *r: pointer to output byte array 
+* - const polyvec *a: pointer to input vector of polynomials +* (of length S2N_KYBER_512_R3_POLYVECBYTES) +**************************************************/ +void polyvec_frombytes(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_frombytes(&r->vec[i], a + i * S2N_KYBER_512_R3_POLYBYTES); + } +} + +/************************************************* +* Name: polyvec_ntt +* +* Description: Apply forward NTT to all elements of a vector of polynomials +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_ntt(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_ntt(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_invntt_tomont +* +* Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_invntt_tomont(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_invntt_tomont(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_pointwise_acc_montgomery +* +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. +* +* Arguments: - poly *r: pointer to output polynomial +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_pointwise_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + poly t; + + poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (unsigned int i = 1; i < S2N_KYBER_512_R3_K; i++) { + poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); + poly_add(r, r, &t); + } + + poly_reduce(r); +} + +/************************************************* +* Name: polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void polyvec_reduce(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_reduce(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector of polynomials +* for details of conditional subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void polyvec_csubq(polyvec *r) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + poly_csubq(&r->vec[i]); + } +} + +/************************************************* +* Name: polyvec_add +* +* Description: Add vectors of polynomials +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { + for (unsigned int i = 0; i < S2N_KYBER_512_R3_K; i++) { + 
poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h new file mode 100644 index 0000000000..797f3c0d31 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec.h @@ -0,0 +1,40 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly.h" + +#define polyvec S2N_KYBER_512_R3_NAMESPACE(polyvec) +typedef struct { + poly vec[S2N_KYBER_512_R3_K]; +} polyvec; + +#define polyvec_compress S2N_KYBER_512_R3_NAMESPACE(polyvec_compress) +void polyvec_compress(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES], polyvec *a); + +#define polyvec_decompress S2N_KYBER_512_R3_NAMESPACE(polyvec_decompress) +void polyvec_decompress(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES]); + +#define polyvec_tobytes S2N_KYBER_512_R3_NAMESPACE(polyvec_tobytes) +void polyvec_tobytes(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], polyvec *a); + +#define polyvec_frombytes S2N_KYBER_512_R3_NAMESPACE(polyvec_frombytes) +void polyvec_frombytes(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]); + +#define polyvec_ntt S2N_KYBER_512_R3_NAMESPACE(polyvec_ntt) +void polyvec_ntt(polyvec *r); + +#define polyvec_invntt_tomont S2N_KYBER_512_R3_NAMESPACE(polyvec_invntt_tomont) +void polyvec_invntt_tomont(polyvec *r); + +#define polyvec_pointwise_acc_montgomery S2N_KYBER_512_R3_NAMESPACE(polyvec_pointwise_acc_montgomery) +void polyvec_pointwise_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); + +#define polyvec_reduce S2N_KYBER_512_R3_NAMESPACE(polyvec_reduce) +void polyvec_reduce(polyvec *r); + +#define polyvec_csubq S2N_KYBER_512_R3_NAMESPACE(polyvec_csubq) +void polyvec_csubq(polyvec *r); + +#define polyvec_add S2N_KYBER_512_R3_NAMESPACE(polyvec_add) +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c new file mode 100644 index 0000000000..8434b96d76 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.c @@ -0,0 +1,227 @@ +#include <stdint.h> +#include <string.h> +#include "kyber512r3_polyvec_avx2.h" +#include "kyber512r3_poly_avx2.h" +#include "kyber512r3_consts_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +static void poly_compress10(uint8_t r[320], const poly * restrict a) +{ + unsigned int i; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]); + const __m256i v8 = _mm256_slli_epi16(v,3); + const __m256i off = _mm256_set1_epi16(15); + const __m256i shift1 = _mm256_set1_epi16(1 << 12); + const __m256i mask = _mm256_set1_epi16(1023); + const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9, + -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0); + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0,v8); + f2 = _mm256_add_epi16(f0,off); + f0 = _mm256_slli_epi16(f0,3); + f0 = _mm256_mulhi_epi16(f0,v); + f2 = _mm256_sub_epi16(f1,f2); + f1 = _mm256_andnot_si256(f1,f2); + f1 = _mm256_srli_epi16(f1,15); + f0 = _mm256_sub_epi16(f0,f1); + f0 = _mm256_mulhrs_epi16(f0,shift1); + f0 
= _mm256_and_si256(f0,mask); + f0 = _mm256_madd_epi16(f0,shift2); + f0 = _mm256_sllv_epi32(f0,sllvdidx); + f0 = _mm256_srli_epi64(f0,12); + f0 = _mm256_shuffle_epi8(f0,shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0,1); + t0 = _mm_blend_epi16(t0,t1,0xE0); + // correcting cast-align error + // old version: _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0); + _mm_storeu_si128((void *)&r[20*i+ 0],t0); + memcpy(&r[20*i+16],&t1,4); + } +} + +static void poly_decompress10(poly * restrict r, const uint8_t a[320+12]) +{ + unsigned int i; + __m256i f; + const __m256i q = _mm256_set1_epi32((S2N_KYBER_512_R3_Q << 16) + 4*S2N_KYBER_512_R3_Q); + const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7, + 6, 5, 5, 4, 4, 3, 3, 2, + 9, 8, 8, 7, 7, 6, 6, 5, + 4, 3, 3, 2, 2, 1, 1, 0); + const __m256i sllvdidx = _mm256_set1_epi64x(4); + const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184); + + for(i=0;i<S2N_KYBER_512_R3_N/16;i++) { + // correcting cast-align and cast-qual errors + // old version: f = _mm256_loadu_si256((__m256i *)&a[20*i]); + f = _mm256_loadu_si256((const void *)&a[20*i]); + f = _mm256_permute4x64_epi64(f,0x94); + f = _mm256_shuffle_epi8(f,shufbidx); + f = _mm256_sllv_epi32(f,sllvdidx); + f = _mm256_srli_epi16(f,1); + f = _mm256_and_si256(f,mask); + f = _mm256_mulhrs_epi16(f,q); + _mm256_store_si256(&r->vec[i],f); + } +} + +/************************************************* +* Name: polyvec_compress_avx2 +* +* Description: Compress and serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a) +{ + unsigned int i; + + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_compress10(&r[320*i],&a->vec[i]); +} + +/************************************************* +* Name: polyvec_decompress_avx2 +* +* Description: De-serialize and decompress vector of polynomials; +* approximate inverse of polyvec_compress_avx2 +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES) +**************************************************/ +void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12]) +{ + unsigned int i; + + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_decompress10(&r->vec[i],&a[320*i]); +} + +/************************************************* +* Name: polyvec_tobytes_avx2 +* +* Description: Serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for S2N_KYBER_512_R3_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_tobytes_avx2(r+i*S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]); +} + +/************************************************* +* Name: polyvec_frombytes_avx2 +* +* Description: De-serialize vector of polynomials; +* inverse of polyvec_tobytes_avx2 +* +* Arguments: - uint8_t *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials +* (of length S2N_KYBER_512_R3_POLYVECBYTES) 
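* For Kyber512 the vector consists of S2N_KYBER_512_R3_K = 2 polynomials, so
* the input is 2 * 384 = 768 bytes; each polynomial is deserialized in turn
* with poly_frombytes_avx2.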
+**************************************************/ +void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_frombytes_avx2(&r->vec[i], a+i*S2N_KYBER_512_R3_POLYBYTES); +} + +/************************************************* +* Name: polyvec_ntt_avx2 +* +* Description: Apply forward NTT to all elements of a vector of polynomials +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_ntt_avx2(polyvec *r) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_ntt_avx2(&r->vec[i]); +} + +/************************************************* +* Name: polyvec_invntt_tomont_avx2 +* +* Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_invntt_tomont_avx2(polyvec *r) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_invntt_tomont_avx2(&r->vec[i]); +} + +/************************************************* +* Name: polyvec_basemul_acc_montgomery_avx2 +* +* Description: Multiply elements in a and b in NTT domain, accumulate into r, +* and multiply by 2^-16. +* +* Arguments: - poly *r: pointer to output polynomial +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b) +{ + unsigned int i; + poly tmp; + + poly_basemul_montgomery_avx2(r,&a->vec[0],&b->vec[0]); + for(i=1;i<S2N_KYBER_512_R3_K;i++) { + poly_basemul_montgomery_avx2(&tmp,&a->vec[i],&b->vec[i]); + poly_add_avx2(r,r,&tmp); + } +} + +/************************************************* +* Name: polyvec_reduce_avx2 +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials; +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - polyvec *r: pointer to input/output polynomial +**************************************************/ +void polyvec_reduce_avx2(polyvec *r) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_reduce_avx2(&r->vec[i]); +} + +/************************************************* +* Name: polyvec_add_avx2 +* +* Description: Add vectors of polynomials +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b) +{ + unsigned int i; + for(i=0;i<S2N_KYBER_512_R3_K;i++) + poly_add_avx2(&r->vec[i], &a->vec[i], &b->vec[i]); +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h new file mode 100644 index 0000000000..536e1b23d0 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_polyvec_avx2.h @@ -0,0 +1,39 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_poly_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define polyvec S2N_KYBER_512_R3_NAMESPACE(polyvec) +typedef struct{ + 
poly vec[S2N_KYBER_512_R3_K]; +} polyvec; + +#define polyvec_compress_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_compress_avx2) +void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a); + +#define polyvec_decompress_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_decompress_avx2) +void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12]); + +#define polyvec_tobytes_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_tobytes_avx2) +void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a); + +#define polyvec_frombytes_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_frombytes_avx2) +void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES]); + +#define polyvec_ntt_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_ntt_avx2) +void polyvec_ntt_avx2(polyvec *r); + +#define polyvec_invntt_tomont_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_invntt_tomont_avx2) +void polyvec_invntt_tomont_avx2(polyvec *r); + +#define polyvec_basemul_acc_montgomery_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_basemul_acc_montgomery_avx2) +void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b); + +#define polyvec_reduce_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_reduce_avx2) +void polyvec_reduce_avx2(polyvec *r); + +#define polyvec_add_avx2 S2N_KYBER_512_R3_NAMESPACE(polyvec_add_avx2) +void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c new file mode 100644 index 0000000000..6219ad7e88 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.c @@ -0,0 +1,60 @@ +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_reduce.h" + +/************************************************* +* Name: montgomery_reduce +* +* Description: Montgomery reduction; given a 32-bit integer a, computes +* 16-bit integer congruent to a * R^-1 mod q, +* where R=2^16 +* +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} +* +* Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. +**************************************************/ +int16_t montgomery_reduce(int32_t a) { + int32_t t; + int16_t u; + + u = a * S2N_KYBER_512_R3_QINV; + t = (int32_t)u * S2N_KYBER_512_R3_Q; + t = a - t; + t >>= 16; + return t; +} + +/************************************************* +* Name: barrett_reduce +* +* Description: Barrett reduction; given a 16-bit integer a, computes +* 16-bit integer congruent to a mod q in {0,...,q} +* +* Arguments: - int16_t a: input integer to be reduced +* +* Returns: integer in {0,...,q} congruent to a modulo q. 
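* For example, with q = 3329 the constant v computed below is
* ((1 << 26) + 1664) / 3329 = 20159; for a = 5000 the quotient estimate is
* (20159 * 5000) >> 26 = 1, so the function returns 5000 - 1*3329 = 1671.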
+**************************************************/ +int16_t barrett_reduce(int16_t a) { + int16_t t; + const int16_t v = ((1U << 26) + S2N_KYBER_512_R3_Q / 2) / S2N_KYBER_512_R3_Q; + + t = (int32_t)v * a >> 26; + t *= S2N_KYBER_512_R3_Q; + return a - t; +} + +/************************************************* +* Name: csubq +* +* Description: Conditionallly subtract q +* +* Arguments: - int16_t x: input integer +* +* Returns: a - q if a >= q, else a +**************************************************/ +int16_t csubq(int16_t a) { + a -= S2N_KYBER_512_R3_Q; + a += (a >> 15) & S2N_KYBER_512_R3_Q; + return a; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h new file mode 100644 index 0000000000..bab9fa54f9 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce.h @@ -0,0 +1,15 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" + +#define S2N_KYBER_512_R3_QINV 62209 /* q^-1 mod 2^16 */ + +#define montgomery_reduce S2N_KYBER_512_R3_NAMESPACE(montgomery_reduce) +int16_t montgomery_reduce(int32_t a); + +#define barrett_reduce S2N_KYBER_512_R3_NAMESPACE(barrett_reduce) +int16_t barrett_reduce(int16_t a); + +#define csubq S2N_KYBER_512_R3_NAMESPACE(csubq) +int16_t csubq(int16_t x); diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h new file mode 100644 index 0000000000..24f0ede4e0 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_reduce_avx2.h @@ -0,0 +1,13 @@ +#pragma once + +#include "kyber512r3_params.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +#define reduce_avx2_asm S2N_KYBER_512_R3_NAMESPACE(reduce_avx2_asm) +void reduce_avx2_asm(__m256i *r, const __m256i *qdata); + +#define tomont_avx2_asm S2N_KYBER_512_R3_NAMESPACE(tomont_avx2_asm) +void tomont_avx2_asm(__m256i *r, const __m256i *qdata); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c new file mode 100644 index 0000000000..1461e0b9b1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.c @@ -0,0 +1,420 @@ +#include <stdint.h> +#include <string.h> +#include "kyber512r3_params.h" +#include "kyber512r3_consts_avx2.h" +#include "kyber512r3_rejsample_avx2.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#include <immintrin.h> + +//#define BMI + +#ifndef BMI +static const uint8_t idx[256][8] = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, -1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 
2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, + { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, + { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, + { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, 
-1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, + { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, -1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 
12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, + { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, -1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} +}; +#endif + +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) + +unsigned int rej_uniform_avx2(int16_t * restrict r, const uint8_t *buf) +{ + unsigned int ctr, pos; + uint16_t val0, val1; + uint32_t good; +#ifdef BMI + uint64_t idx0, idx1, idx2, idx3; +#endif + const __m256i bound = _mm256_load_si256(&qdata.vec[_16XQ/16]); + const __m256i ones = _mm256_set1_epi8(1); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15,14,14,13,12,11,11,10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11,10,10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; + + ctr = pos = 0; + while(ctr <= S2N_KYBER_512_R3_N - 32 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 48) { + // correcting cast-align and cast-qual errors + // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f0 = _mm256_loadu_si256((const void *)&buf[pos]); + // old version: f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]); + f1 = _mm256_loadu_si256((const void *)&buf[pos+24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; + + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = _mm256_cmpgt_epi16(bound, f1); + + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); + +#ifdef BMI + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); + + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); +#else + // correcting cast-align and cast-qual errors + // old version: g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 0) & 0xFF])); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 0) & 0xFF])); + // old version: g1 = 
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 8) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 8) & 0xFF])); + // old version: g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((const void *)&idx[(good >> 16) & 0xFF]), 1); + // old version: g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((const void *)&idx[(good >> 24) & 0xFF]), 1); +#endif + + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); + + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); + + // correcting cast-align errors + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); + } + + while(ctr <= S2N_KYBER_512_R3_N - 8 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 12) { + // correcting cast-align and cast-qual errors + // old version: f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_loadu_si128((const void *)&buf[pos]); + f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + +#ifdef BMI + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); +#else + good = _pext_u32(good, 0x5555); + // correcting cast-align and cast-qual errors + // old version: pilo = _mm_loadl_epi64((__m128i *)&idx[good]); + pilo = _mm_loadl_epi64((const void *)&idx[good]); +#endif + + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); + pilo = _mm_unpacklo_epi8(pilo, pihi); + f = _mm_shuffle_epi8(f, pilo); + // correcting cast-align error + // old version: _mm_storeu_si128((__m128i *)&r[ctr], f); + _mm_storeu_si128((void *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); + } + + while(ctr < S2N_KYBER_512_R3_N && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF; + val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)); + pos += 3; + + if(val0 < S2N_KYBER_512_R3_Q) + r[ctr++] = val0; + if(val1 < S2N_KYBER_512_R3_Q && ctr < S2N_KYBER_512_R3_N) + r[ctr++] = val1; + } + + return ctr; +} +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h new file mode 100644 index 0000000000..bd8a970464 --- /dev/null +++ 
b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_rejsample_avx2.h @@ -0,0 +1,14 @@ +#pragma once + +#include <stdint.h> +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" + +#if defined(S2N_KYBER512R3_AVX2_BMI2) +#define S2N_KYBER_512_R3_XOF_BLOCKBYTES S2N_KYBER_512_R3_SHAKE128_RATE +#define S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS ((12*S2N_KYBER_512_R3_N/8*(1 << 12)/S2N_KYBER_512_R3_Q + S2N_KYBER_512_R3_XOF_BLOCKBYTES)/S2N_KYBER_512_R3_XOF_BLOCKBYTES) +#define S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN (S2N_KYBER_512_R3_REJ_UNIFORM_AVX_NBLOCKS*S2N_KYBER_512_R3_XOF_BLOCKBYTES) + +#define rej_uniform_avx2 S2N_KYBER_512_R3_NAMESPACE(rej_uniform_avx2) +unsigned int rej_uniform_avx2(int16_t *r, const uint8_t *buf); +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S new file mode 100644 index 0000000000..ce7200e5ca --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_shuffle_avx2.S @@ -0,0 +1,272 @@ +#include "kyber512r3_consts_avx2.h" + +// The small macros (.inc files) are combined with .S files directly +/*****.include "fq.inc"*****/ +/***************************/ +.macro red16 r,rs=0,x=12 +vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else +vpsraw $10,%ymm\x,%ymm\x +.endif +vpmullw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro csubq r,x=12 +vpsubw %ymm0,%ymm\r,%ymm\r +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro caddq r,x=12 +vpsraw $15,%ymm\r,%ymm\x +vpand %ymm0,%ymm\x,%ymm\x +vpaddw %ymm\x,%ymm\r,%ymm\r +.endm + +.macro fqmulprecomp al,ah,b,x=12 +vpmullw %ymm\al,%ymm\b,%ymm\x +vpmulhw %ymm\ah,%ymm\b,%ymm\b +vpmulhw %ymm0,%ymm\x,%ymm\x +vpsubw %ymm\x,%ymm\b,%ymm\b +.endm +/***************************/ + +/*****.include "shuffle.inc"*****/ +/********************************/ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm +/********************************/ + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(nttunpack_avx2_asm) +cdecl(nttunpack_avx2_asm): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret + +ntttobytes128_avx: +#load 
+vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#csubq +csubq 5,13 +csubq 6,13 +csubq 7,13 +csubq 8,13 +csubq 9,13 +csubq 10,13 +csubq 11,13 +csubq 12,13 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu %ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +.global cdecl(ntttobytes_avx2_asm) +cdecl(ntttobytes_avx2_asm): +#consts +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 +shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +.global cdecl(nttfrombytes_avx2_asm) +cdecl(nttfrombytes_avx2_asm): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c new file mode 100644 index 0000000000..390a2a4e38 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric-shake.c @@ -0,0 +1,49 @@ +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" +#include "kyber512r3_symmetric.h" +#include <stdlib.h> + +/************************************************* +* Name: kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+ +* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *input: pointer to S2N_KYBER_512_R3_SYMBYTES input to be absorbed into s +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { + size_t i; + uint8_t extseed[S2N_KYBER_512_R3_SYMBYTES + 2]; + + for (i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + extseed[i] = input[i]; + } + extseed[i++] = x; + extseed[i] = y; + shake128_absorb(s, extseed, S2N_KYBER_512_R3_SYMBYTES + 2); +} + +/************************************************* +* Name: shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *output: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t * key: pointer to the key (of length S2N_KYBER_512_R3_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { + uint8_t extkey[S2N_KYBER_512_R3_SYMBYTES + 1]; + size_t i; + + for (i = 0; i < S2N_KYBER_512_R3_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(output, outlen, extkey, S2N_KYBER_512_R3_SYMBYTES + 1); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h new file mode 100644 index 0000000000..e898a29450 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/kyber_r3/kyber512r3_symmetric.h @@ -0,0 +1,17 @@ +#pragma once + +#include "kyber512r3_params.h" +#include "kyber512r3_fips202.h" +#include <stdint.h> + +#define keccak_state S2N_KYBER_512_R3_NAMESPACE(keccak_state) +typedef shake128ctx keccak_state; + +#define xof_state S2N_KYBER_512_R3_NAMESPACE(xof_state) +typedef keccak_state xof_state; + +#define kyber_shake128_absorb S2N_KYBER_512_R3_NAMESPACE(kyber_shake128_absorb) +void kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); + +#define shake256_prf S2N_KYBER_512_R3_NAMESPACE(shake256_prf) +void shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c index 7381deed4e..8eda65be59 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c +++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.c @@ -14,13 +14,23 @@ */ #include "s2n_pq.h" +#include "crypto/s2n_openssl.h" -static bool sikep434r2_asm_enabled = false; +static bool sikep434r3_asm_enabled = false; + +/* BIKE Round-3 code supports several levels of optimization */ +static bool bike_r3_avx2_enabled = false; +static bool bike_r3_avx512_enabled = false; +static bool bike_r3_pclmul_enabled = false; +static bool bike_r3_vpclmul_enabled = false; + +static bool kyber512r3_avx2_bmi2_enabled = false; #if defined(S2N_CPUID_AVAILABLE) /* https://en.wikipedia.org/wiki/CPUID */ #include <cpuid.h> +#define PROCESSOR_INFO_AND_FEATURES 1 #define EXTENDED_FEATURES_LEAF 7 #define EXTENDED_FEATURES_SUBLEAF_ZERO 0 @@ -35,6 +45,12 @@ static bool sikep434r2_asm_enabled = false; #define bit_BMI2 (1 << 8) #endif +/* BIKE related CPU features */ +#define EBX_BIT_AVX2 (1 << 5) +#define EBX_BIT_AVX512 (1 << 16) +#define ECX_BIT_VPCLMUL (1 << 
10) +#define ECX_BIT_PCLMUL (1 << 1) + bool s2n_get_cpuid_count(uint32_t leaf, uint32_t sub_leaf, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { /* 0x80000000 probes for extended cpuid info */ uint32_t max_level = __get_cpuid_max(leaf & 0x80000000, 0); @@ -67,56 +83,228 @@ bool s2n_cpu_supports_adx() { return (ebx & bit_ADX); } -bool s2n_cpu_supports_sikep434r2_asm() { -#if defined(S2N_SIKEP434R2_ASM) - /* The sikep434r2 assembly code always requires BMI2. If the assembly +bool s2n_cpu_supports_avx2() { + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + + return (ebx & EBX_BIT_AVX2); +} + +bool s2n_cpu_supports_sikep434r3_asm() { +#if defined(S2N_SIKE_P434_R3_ASM) + /* The sikep434r3 assembly code always requires BMI2. If the assembly * was compiled with support for ADX, we also require ADX at runtime. */ - #if defined(S2N_ADX) - return s2n_cpu_supports_bmi2() && s2n_cpu_supports_adx(); - #else - return s2n_cpu_supports_bmi2(); - #endif +#if defined(S2N_ADX) + return s2n_cpu_supports_bmi2() && s2n_cpu_supports_adx(); +#else + return s2n_cpu_supports_bmi2(); +#endif +#else + /* sikep434r3 assembly was not supported at compile time */ + return false; +#endif /* defined(S2N_SIKE_P434_R3_ASM) */ +} + +bool s2n_cpu_supports_bike_r3_avx2() { +#if defined(S2N_BIKE_R3_AVX2) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ebx & EBX_BIT_AVX2) != 0); +#else + return false; +#endif +} + +bool s2n_cpu_supports_bike_r3_avx512() { +#if defined(S2N_BIKE_R3_AVX512) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ebx & EBX_BIT_AVX512) != 0); +#else + return false; +#endif +} + +bool s2n_cpu_supports_bike_r3_pclmul() { +#if defined(S2N_BIKE_R3_PCLMUL) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(PROCESSOR_INFO_AND_FEATURES, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ecx & ECX_BIT_PCLMUL) != 0); #else - /* sikep434r2 assembly was not supported at compile time */ return false; -#endif /* defined(S2N_SIKEP434R2_ASM) */ +#endif +} + +bool s2n_cpu_supports_bike_r3_vpclmul() { +#if defined(S2N_BIKE_R3_AVX512) + uint32_t eax, ebx, ecx, edx; + if (!s2n_get_cpuid_count(EXTENDED_FEATURES_LEAF, EXTENDED_FEATURES_SUBLEAF_ZERO, &eax, &ebx, &ecx, &edx)) { + return false; + } + return ((ecx & ECX_BIT_VPCLMUL) != 0); +#else + return false; +#endif +} + +bool s2n_cpu_supports_kyber512r3_avx2_bmi2() { +#if defined(S2N_KYBER512R3_AVX2_BMI2) + return s2n_cpu_supports_bmi2() && s2n_cpu_supports_avx2(); +#else + return false; +#endif } #else /* defined(S2N_CPUID_AVAILABLE) */ /* If CPUID is not available, we cannot perform necessary run-time checks. 
*/ -bool s2n_cpu_supports_sikep434r2_asm() { +bool s2n_cpu_supports_sikep434r3_asm() { + return false; +} + +bool s2n_cpu_supports_bike_r3_avx2() { + return false; +} + +bool s2n_cpu_supports_bike_r3_avx512() { + return false; +} + +bool s2n_cpu_supports_bike_r3_pclmul() { + return false; +} + +bool s2n_cpu_supports_bike_r3_vpclmul() { + return false; +} + +bool s2n_cpu_supports_kyber512r3_avx2_bmi2() { return false; } #endif /* defined(S2N_CPUID_AVAILABLE) */ -bool s2n_sikep434r2_asm_is_enabled() { - return sikep434r2_asm_enabled; +bool s2n_sikep434r3_asm_is_enabled() { + return sikep434r3_asm_enabled; +} + +bool s2n_bike_r3_is_avx2_enabled() { + return bike_r3_avx2_enabled; +} + +bool s2n_bike_r3_is_avx512_enabled() { + return bike_r3_avx512_enabled; +} + +bool s2n_bike_r3_is_pclmul_enabled() { + return bike_r3_pclmul_enabled; +} + +bool s2n_bike_r3_is_vpclmul_enabled() { + return bike_r3_vpclmul_enabled; +} + +bool s2n_kyber512r3_is_avx2_bmi2_enabled() { + return kyber512r3_avx2_bmi2_enabled; } bool s2n_pq_is_enabled() { #if defined(S2N_NO_PQ) return false; #else - return !s2n_is_in_fips_mode(); + /* aws-lc is currently the only supported FIPS library known to support PQ. */ + return s2n_libcrypto_is_awslc() || (!s2n_is_in_fips_mode()); #endif } -S2N_RESULT s2n_disable_sikep434r2_asm() { - sikep434r2_asm_enabled = false; +S2N_RESULT s2n_disable_sikep434r3_asm() { + sikep434r3_asm_enabled = false; + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_disable_bike_r3_opt_all() { + bike_r3_avx2_enabled = false; + bike_r3_avx512_enabled = false; + bike_r3_pclmul_enabled = false; + bike_r3_vpclmul_enabled = false; return S2N_RESULT_OK; } -S2N_RESULT s2n_try_enable_sikep434r2_asm() { - if (s2n_pq_is_enabled() && s2n_cpu_supports_sikep434r2_asm()) { - sikep434r2_asm_enabled = true; +S2N_RESULT s2n_disable_kyber512r3_opt_avx2_bmi2() { + kyber512r3_avx2_bmi2_enabled = false; + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_bike_r3_opt_pclmul() { + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_pclmul()) { + bike_r3_pclmul_enabled = true; } return S2N_RESULT_OK; } -S2N_RESULT s2n_pq_init() { - ENSURE_OK(s2n_try_enable_sikep434r2_asm(), S2N_ERR_SAFETY); +S2N_RESULT s2n_try_enable_bike_r3_opt_avx2() { + /* When AVX2 is available, PCLMUL is too by default. */ + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_pclmul(), S2N_ERR_SAFETY); + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_avx2()) { + bike_r3_avx2_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_bike_r3_opt_avx512() { + /* When AVX512 is available, AVX2 is too by default. */ + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_avx2(), S2N_ERR_SAFETY); + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_avx512()) { + bike_r3_avx512_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_bike_r3_opt_vpclmul() { + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_avx512(), S2N_ERR_SAFETY); + /* Only Enable VPCLMUL if AVX512 is also supported. This is to because the BIKE R3 VPCLMUL requires 512-bit version + * of VPCLMUL, and not the 256-bit version that is available on AMD Zen 3 processors. 
*/ + if (s2n_pq_is_enabled() && s2n_cpu_supports_bike_r3_vpclmul() && s2n_bike_r3_is_avx512_enabled()) { + bike_r3_vpclmul_enabled = true; + } + return S2N_RESULT_OK; +} +S2N_RESULT s2n_try_enable_sikep434r3_asm() { + if (s2n_pq_is_enabled() && s2n_cpu_supports_sikep434r3_asm()) { + sikep434r3_asm_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_try_enable_kyber512r3_opt_avx2_bmi2() { + if (s2n_pq_is_enabled() && s2n_cpu_supports_kyber512r3_avx2_bmi2()) { + kyber512r3_avx2_bmi2_enabled = true; + } + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_bike_r3_x86_64_opt_init() +{ + /* try_enable_vpclmul function recursively tries to enable + * all the optimizations (avx2, avx512, pclmul, vpclmul), + * so it's sufficient to call only this function. */ + RESULT_ENSURE_OK(s2n_try_enable_bike_r3_opt_vpclmul(), S2N_ERR_SAFETY); + return S2N_RESULT_OK; +} + +S2N_RESULT s2n_pq_init() { + RESULT_ENSURE_OK(s2n_try_enable_sikep434r3_asm(), S2N_ERR_SAFETY); + RESULT_ENSURE_OK(s2n_bike_r3_x86_64_opt_init(), S2N_ERR_SAFETY); + RESULT_ENSURE_OK(s2n_try_enable_kyber512r3_opt_avx2_bmi2(), S2N_ERR_SAFETY); + return S2N_RESULT_OK; } diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h index 7e5d93e991..2af5c4c940 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h +++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq.h @@ -20,8 +20,23 @@ #include "utils/s2n_safety.h" #include "crypto/s2n_fips.h" -bool s2n_sikep434r2_asm_is_enabled(void); +bool s2n_sikep434r3_asm_is_enabled(void); +S2N_RESULT s2n_disable_sikep434r3_asm(void); +S2N_RESULT s2n_try_enable_sikep434r3_asm(void); + +bool s2n_bike_r3_is_avx2_enabled(void); +bool s2n_bike_r3_is_avx512_enabled(void); +bool s2n_bike_r3_is_pclmul_enabled(void); +bool s2n_bike_r3_is_vpclmul_enabled(void); +S2N_RESULT s2n_disable_bike_r3_opt_all(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_pclmul(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_avx2(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_avx512(void); +S2N_RESULT s2n_try_enable_bike_r3_opt_vpclmul(void); + +bool s2n_kyber512r3_is_avx2_bmi2_enabled(void); +S2N_RESULT s2n_try_enable_kyber512r3_opt_avx2_bmi2(void); +S2N_RESULT s2n_disable_kyber512r3_opt_avx2_bmi2(void); + bool s2n_pq_is_enabled(void); -S2N_RESULT s2n_disable_sikep434r2_asm(void); -S2N_RESULT s2n_try_enable_sikep434r2_asm(void); S2N_RESULT s2n_pq_init(void); diff --git a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c index 845def4a31..275a3e132d 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c +++ b/contrib/restricted/aws/s2n/pq-crypto/s2n_pq_random.c @@ -23,21 +23,21 @@ static S2N_RESULT s2n_get_random_bytes_default(uint8_t *buffer, uint32_t num_byt static s2n_get_random_bytes_callback s2n_get_random_bytes_cb = s2n_get_random_bytes_default; S2N_RESULT s2n_get_random_bytes(uint8_t *buffer, uint32_t num_bytes) { - ENSURE_REF(buffer); - GUARD_RESULT(s2n_get_random_bytes_cb(buffer, num_bytes)); + RESULT_ENSURE_REF(buffer); + RESULT_GUARD(s2n_get_random_bytes_cb(buffer, num_bytes)); return S2N_RESULT_OK; } static S2N_RESULT s2n_get_random_bytes_default(uint8_t *buffer, uint32_t num_bytes) { struct s2n_blob out = { .data = buffer, .size = num_bytes }; - GUARD_RESULT(s2n_get_private_random_data(&out)); + RESULT_GUARD(s2n_get_private_random_data(&out)); return S2N_RESULT_OK; } S2N_RESULT s2n_set_rand_bytes_callback_for_testing(s2n_get_random_bytes_callback rand_bytes_callback) { - ENSURE(s2n_in_unit_test(), 
S2N_ERR_NOT_IN_UNIT_TEST); + RESULT_ENSURE(s2n_in_unit_test(), S2N_ERR_NOT_IN_UNIT_TEST); s2n_get_random_bytes_cb = rand_bytes_callback; diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h index f6674fa2bc..64465f19ed 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/P503_internal_r1.h @@ -150,7 +150,7 @@ void fpdiv2_503(const digit_t* a, digit_t* c); void fpcorrection503(digit_t* a); // 503-bit Montgomery reduction, c = a mod p -void rdc_mont(const digit_t* a, digit_t* c); +void rdc_mont(const dfelm_t ma, felm_t mc); // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768 void fpmul503_mont(const felm_t a, const felm_t b, felm_t c); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h index 128a0127bf..983537c2ca 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/fips202_r1.h @@ -7,7 +7,7 @@ #define SHAKE128_RATE 168 #define SHAKE256_RATE 136 -void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); +void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen); void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); #endif // FIPS202_R1_H diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c index 7f3c63fd85..bdf2834121 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sidh_r1.c @@ -63,7 +63,7 @@ int random_mod_order_B(unsigned char* random_digits) unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS-1); clear_words((void*)random_digits, MAXWORDS_ORDER); - GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, nbytes)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(random_digits, nbytes)); random_digits[nbytes-1] &= MASK_BOB; // Masking last byte return S2N_SUCCESS; diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c index 3122eb6539..ee905ca74a 100644 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r1/sike_r1_kem.c @@ -16,13 +16,13 @@ int SIKE_P503_r1_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { // SIKE's key generation // Outputs: secret key sk (SIKE_P503_R1_SECRET_KEY_BYTES = MSG_BYTES + SECRETKEY_B_BYTES + SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) // public key pk (SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); digit_t _sk[SECRETKEY_B_BYTES/sizeof(digit_t)]; // Generate lower portion of secret key sk <- s||SK - GUARD_AS_POSIX(s2n_get_random_bytes(sk, MSG_BYTES)); - GUARD(random_mod_order_B((unsigned char*)_sk)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk, MSG_BYTES)); + POSIX_GUARD(random_mod_order_B((unsigned char*)_sk)); // Generate public key pk EphemeralKeyGeneration_B(_sk, pk); @@ -40,7 +40,7 @@ int SIKE_P503_r1_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsi // Input: public key pk (SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) // Outputs: shared secret ss 
(SIKE_P503_R1_SHARED_SECRET_BYTES bytes) // ciphertext message ct (SIKE_P503_R1_CIPHERTEXT_BYTES = SIKE_P503_R1_PUBLIC_KEY_BYTES + MSG_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); const uint16_t G = 0; const uint16_t H = 1; @@ -55,7 +55,7 @@ int SIKE_P503_r1_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsi unsigned int i; // Generate ephemeralsk <- G(m||pk) mod oA - GUARD_AS_POSIX(s2n_get_random_bytes(temp, MSG_BYTES)); + POSIX_GUARD_RESULT(s2n_get_random_bytes(temp, MSG_BYTES)); memcpy(&temp[MSG_BYTES], pk, SIKE_P503_R1_PUBLIC_KEY_BYTES); cshake256_simple(ephemeralsk.b, SECRETKEY_A_BYTES, G, temp, SIKE_P503_R1_PUBLIC_KEY_BYTES+MSG_BYTES); @@ -82,7 +82,7 @@ int SIKE_P503_r1_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, cons // Input: secret key sk (SIKE_P503_R1_SECRET_KEY_BYTES = MSG_BYTES + SECRETKEY_B_BYTES + SIKE_P503_R1_PUBLIC_KEY_BYTES bytes) // ciphertext message ct (SIKE_P503_R1_CIPHERTEXT_BYTES = SIKE_P503_R1_PUBLIC_KEY_BYTES + MSG_BYTES bytes) // Outputs: shared secret ss (SIKE_P503_R1_SHARED_SECRET_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); const uint16_t G = 0; const uint16_t H = 1; @@ -117,9 +117,13 @@ int SIKE_P503_r1_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, cons // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) EphemeralKeyGeneration_A(ephemeralsk_.d, c0_); - if (memcmp(c0_, ct, SIKE_P503_R1_PUBLIC_KEY_BYTES) != 0) { - memcpy(temp, sk, MSG_BYTES); - } + + // Note: This step deviates from the NIST supplied code by using constant time operations. + // We only want to copy the data if c0_ and ct are different + bool dont_copy = s2n_constant_time_equals(c0_, ct, SIKE_P503_R1_PUBLIC_KEY_BYTES); + // The last argument to s2n_constant_time_copy_or_dont is dont and thus prevents the copy when non-zero/true + s2n_constant_time_copy_or_dont(temp, sk, MSG_BYTES, dont_copy); + memcpy(&temp[MSG_BYTES], ct, SIKE_P503_R1_CIPHERTEXT_BYTES); cshake256_simple(ss, SIKE_P503_R1_SHARED_SECRET_BYTES, H, temp, SIKE_P503_R1_CIPHERTEXT_BYTES+MSG_BYTES); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c deleted file mode 100644 index 4288a5d186..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434.c +++ /dev/null @@ -1,117 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: supersingular isogeny parameters and generation of functions for P434 -*********************************************************************************************/ - -#include "P434_api.h" -#include "P434_internal.h" -#include "pq-crypto/s2n_pq.h" - -// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: -// -------------------------------------------------------------------------------------------------- -// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). -// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. -// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. 
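The constant-time change in the SIKE r1 decapsulation hunk above replaces a memcmp-guarded memcpy with s2n_constant_time_equals() and s2n_constant_time_copy_or_dont(), removing the secret-dependent branch. As a rough sketch only (not s2n's actual implementation; the helper names below are hypothetical), a mask-based equivalent of that pattern looks like this:

#include <stddef.h>
#include <stdint.h>

/* Sketch: returns 1 if a and b are equal, 0 otherwise, reading every byte
 * so the running time does not depend on where a mismatch occurs. */
static int ct_equals_sketch(const uint8_t *a, const uint8_t *b, size_t len)
{
    uint8_t diff = 0;
    for (size_t i = 0; i < len; i++) {
        diff |= (uint8_t)(a[i] ^ b[i]);           /* accumulate differences without branching */
    }
    return (int)(((uint32_t)diff - 1) >> 31);     /* 1 only when diff == 0 */
}

/* Sketch: copy src into dest only when dont == 0, using a byte mask instead of a branch. */
static void ct_copy_or_dont_sketch(uint8_t *dest, const uint8_t *src, size_t len, int dont)
{
    uint8_t mask = (uint8_t)((uint8_t)dont - 1u); /* dont=0 -> 0xFF (copy), dont=1 -> 0x00 (keep) */
    for (size_t i = 0; i < len; i++) {
        dest[i] = (uint8_t)((dest[i] & (uint8_t)~mask) | (src[i] & mask));
    }
}

In the diff itself, dont_copy carries the result of comparing the re-encrypted ciphertext against ct, and the guarded copy of sk into temp proceeds (or not) with the same memory access pattern either way.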
-// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. -// For example, a 434-bit field element is represented with Ceil(434 / 64) = 7 64-bit digits or Ceil(434 / 32) = 14 32-bit digits. - -// -// Curve isogeny system "SIDHp434". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p434^2), where A=6, B=1, C=1 and p434 = 2^216*3^137-1 -// - - -// The constants p434, p434p1, and p434x2 have been duplicated in -// sikep434r2_fp_x64_asm.S. If, for any reason, the constants are changed in -// one file, they should be updated in the other file as well. -const uint64_t p434[NWORDS64_FIELD] = {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFDC1767AE2FFFFFF, - 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344}; -const uint64_t p434p1[NWORDS64_FIELD] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000, - 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344}; -const uint64_t p434x2[NWORDS64_FIELD] = {0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFB82ECF5C5FFFFFF, - 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688}; -// Order of Alice's subgroup -const uint64_t Alice_order[NWORDS64_ORDER] = {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000}; -// Order of Bob's subgroup -const uint64_t Bob_order[NWORDS64_ORDER] = {0x58AEA3FDC1767AE3, 0xC520567BC65C7831, 0x1773446CFC5FD681, 0x0000000002341F27}; -// Alice's generator values {XPA0 + XPA1*i, XQA0 + xQA1*i, XRA0 + XRA1*i} in GF(p434^2), expressed in Montgomery representation -const uint64_t A_gen[6 * NWORDS64_FIELD] = {0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, 0x70E792DC89FA27B1, - 0xF797F526BB48C8CD, 0x2181DB6131AF621F, 0x00000A1C08B1ECC4, // XPA0 - 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, 0x8CD8E51F7AACFFAA, - 0xA7F424730D7E419F, 0xD671EB919A179E8C, 0x0000FFA26C5A924A, // XPA1 - 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, 0xE23941F470841B03, - 0x1B63EDA2045538DD, 0x735CFEB0FFD49215, 0x0001C4CB77542876, // XQA0 - 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, 0x1E2E5D5FF524E374, - 0xE2DDA115260E2995, 0xA6E4B552E2EDE508, 0x00018ECCDDF4B53E, // XQA1 - 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, 0x60E17AC16D2F82AD, - 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, 0x00022A81D8D55643, // XRA0 - 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, 0x7799994BAA96E0E4, - 0x044961599E379AF8, 0xDB2B94FBF09F27E2, 0x0000B87FC716C0C6}; // XRA1 -// Bob's generator values {XPB0, XQB0, XRB0 + XRB1*i} in GF(p434^2), expressed in Montgomery representation -const uint64_t B_gen[6 * NWORDS64_FIELD] = {0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, 0x5864A4A69D450C4F, - 0xB883F276A6490D2B, 0x22CC287022D5F5B9, 0x0001BED4772E551F, // XPB0 - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, // XPB1 - 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, 0x498FF4A4AF60BD62, - 0xB00AD2A708267E8A, 0xF4328294E017837F, 0x000034080181D8AE, // XQB0 - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, - 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, // XQB1 - 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, 0x68A2BA8AA262EC9D, - 0x8176F112EA43F45B, 0x02106D022634F504, 0x00007E8A50F02E37, // XRB0 - 
0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, 0x2B35A68239D48A53, - 0x445F6FD138407C93, 0xBEF93B29A3F6B54B, 0x000173FA910377D3}; // XRB1 -// Montgomery constant Montgomery_R2 = (2^448)^2 mod p434 -const uint64_t Montgomery_R2[NWORDS64_FIELD] = {0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, 0x175CC6AF8D6C7C0B, - 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, 0x000025A89BCDD12A}; -// Value one in Montgomery representation -const uint64_t Montgomery_one[NWORDS64_FIELD] = {0x000000000000742C, 0x0000000000000000, 0x0000000000000000, 0xB90FF404FC000000, - 0xD801A4FB559FACD4, 0xE93254545F77410C, 0x0000ECEEA7BD2EDA}; - -// Fixed parameters for isogeny tree computation -const unsigned int strat_Alice[MAX_Alice - 1] = { - 48, 28, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, - 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, - 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1}; - -const unsigned int strat_Bob[MAX_Bob - 1] = { - 66, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, - 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 32, 16, 8, 4, 3, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, - 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1}; - -// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions -#define fpcopy fpcopy434 -#define fpzero fpzero434 -#define fpadd fpadd434 -#define fpsub fpsub434 -#define fpneg fpneg434 -#define fpdiv2 fpdiv2_434 -#define fpcorrection fpcorrection434 -#define fpmul_mont fpmul434_mont -#define fpsqr_mont fpsqr434_mont -#define fpinv_mont fpinv434_mont -#define fpinv_chain_mont fpinv434_chain_mont -#define fp2copy fp2copy434 -#define fp2zero fp2zero434 -#define fp2add fp2add434 -#define fp2sub fp2sub434 -#define fp2neg fp2neg434 -#define fp2div2 fp2div2_434 -#define fp2correction fp2correction434 -#define fp2mul_mont fp2mul434_mont -#define fp2sqr_mont fp2sqr434_mont -#define fp2inv_mont fp2inv434_mont -#define mp_add_asm mp_add434_asm -#define mp_subaddx2_asm mp_subadd434x2_asm -#define mp_dblsubx2_asm mp_dblsub434x2_asm -#define random_mod_order_A oqs_kem_sidh_p434_random_mod_order_A -#define random_mod_order_B oqs_kem_sidh_p434_random_mod_order_B -#define EphemeralKeyGeneration_A oqs_kem_sidh_p434_EphemeralKeyGeneration_A -#define EphemeralKeyGeneration_B oqs_kem_sidh_p434_EphemeralKeyGeneration_B -#define EphemeralSecretAgreement_A oqs_kem_sidh_p434_EphemeralSecretAgreement_A -#define EphemeralSecretAgreement_B oqs_kem_sidh_p434_EphemeralSecretAgreement_B - -#include "fp.c" -#include "fpx.c" -#include "ec_isogeny.c" -#include "sidh.c" -#include "sike_r2_kem.c" diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h deleted file mode 100644 index bdf3eee8cd..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_api.h +++ /dev/null @@ -1,70 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: API header file for P434 -*********************************************************************************************/ - -#ifndef 
P434_API_H -#define P434_API_H - -#include "P434_internal.h" - -/*********************** Key encapsulation mechanism API ***********************/ - -#define CRYPTO_SECRETKEYBYTES 374 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes -#define CRYPTO_PUBLICKEYBYTES 330 -#define CRYPTO_BYTES 16 -#define CRYPTO_CIPHERTEXTBYTES 346 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes - -// Encoding of keys for KEM-based isogeny system "SIKEp434" (wire format): -// ---------------------------------------------------------------------- -// Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). -// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are encoded as {a, b}, with a in the lowest memory portion. -// -// Private keys sk consist of the concatenation of a 16-byte random value, a value in the range [0, 2^217-1] and the public key pk. In the SIKE API, -// private keys are encoded in 374 octets in little endian format. -// Public keys pk consist of 3 elements in GF(p434^2). In the SIKE API, pk is encoded in 330 octets. -// Ciphertexts ct consist of the concatenation of a public key value and a 16-byte value. In the SIKE API, ct is encoded in 330 + 16 = 346 octets. -// Shared keys ss consist of a value of 16 octets. - -/*********************** Ephemeral key exchange API ***********************/ - -// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. -// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. -// Extended version available at: http://eprint.iacr.org/2016/859 - -// Generation of Alice's secret key -// Outputs random value in [0, 2^216 - 1] to be used as Alice's private key -int oqs_kem_sidh_p434_random_mod_order_A(unsigned char *random_digits); - -// Generation of Bob's secret key -// Outputs random value in [0, 2^Floor(Log(2,3^137)) - 1] to be used as Bob's private key -int oqs_kem_sidh_p434_random_mod_order_B(unsigned char *random_digits); - -// Alice's ephemeral public key generation -// Input: a private key PrivateKeyA in the range [0, 2^216 - 1], stored in 27 bytes. -// Output: the public key PublicKeyA consisting of 3 GF(p434^2) elements encoded in 330 bytes. -int oqs_kem_sidh_p434_EphemeralKeyGeneration_A(const digit_t *PrivateKeyA, unsigned char *PublicKeyA); - -// Bob's ephemeral key-pair generation -// It produces a private key PrivateKeyB and computes the public key PublicKeyB. -// The private key is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. -// The public key consists of 3 GF(p434^2) elements encoded in 330 bytes. -int oqs_kem_sidh_p434_EphemeralKeyGeneration_B(const digit_t *PrivateKeyB, unsigned char *PublicKeyB); - -// Alice's ephemeral shared secret computation -// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB -// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^216 - 1], stored in 27 bytes. -// Bob's PublicKeyB consists of 3 GF(p434^2) elements encoded in 330 bytes. -// Output: a shared secret SharedSecretA that consists of one element in GF(p434^2) encoded in 110 bytes. 
-int oqs_kem_sidh_p434_EphemeralSecretAgreement_A(const digit_t *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA); - -// Bob's ephemeral shared secret computation -// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA -// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. -// Alice's PublicKeyA consists of 3 GF(p434^2) elements encoded in 330 bytes. -// Output: a shared secret SharedSecretB that consists of one element in GF(p434^2) encoded in 110 bytes. -int oqs_kem_sidh_p434_EphemeralSecretAgreement_B(const digit_t *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB); - - -#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h deleted file mode 100644 index 30056d455b..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/P434_internal.h +++ /dev/null @@ -1,225 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: internal header file for P434 -*********************************************************************************************/ - -#ifndef P434_INTERNAL_H -#define P434_INTERNAL_H - -#include "config.h" - -#if (TARGET == TARGET_AMD64) -#define NWORDS_FIELD 7 // Number of words of a 434-bit field element -#define p434_ZERO_WORDS 3 // Number of "0" digits in the least significant part of p434 + 1 -#elif (TARGET == TARGET_x86) -#define NWORDS_FIELD 14 -#define p434_ZERO_WORDS 6 -#elif (TARGET == TARGET_ARM) -#define NWORDS_FIELD 14 -#define p434_ZERO_WORDS 6 -#elif (TARGET == TARGET_ARM64) -#define NWORDS_FIELD 7 -#define p434_ZERO_WORDS 3 -#endif - -// Basic constants - -#define NBITS_FIELD 434 -#define MAXBITS_FIELD 448 -#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements -#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 434-bit field element -#define NBITS_ORDER 256 -#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. -#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 224-bit element -#define MAXBITS_ORDER NBITS_ORDER -#define ALICE 0 -#define BOB 1 -#define OALICE_BITS 216 -#define OBOB_BITS 218 -#define OBOB_EXPON 137 -#define MASK_ALICE 0xFF -#define MASK_BOB 0x01 -#define PRIME p434 -#define PARAM_A 6 -#define PARAM_C 1 -// Fixed parameters for isogeny tree computation -#define MAX_INT_POINTS_ALICE 7 -#define MAX_INT_POINTS_BOB 8 -#define MAX_Alice 108 -#define MAX_Bob 137 -#define MSG_BYTES 16 -#define SECRETKEY_A_BYTES ((OALICE_BITS + 7) / 8) -#define SECRETKEY_B_BYTES ((OBOB_BITS - 1 + 7) / 8) -#define FP2_ENCODED_BYTES 2 * ((NBITS_FIELD + 7) / 8) - -// SIDH's basic element definitions and point representations - -typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 434-bit field elements (448-bit max.) -typedef digit_t dfelm_t[2 * NWORDS_FIELD]; // Datatype for representing double-precision 2x434-bit field elements (448-bit max.) 
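The deleted P434 headers above spell out the representation being removed: on a 64-bit target a 434-bit field element is held in NWORDS_FIELD = Ceil(434/64) = 7 digits (felm_t), and a double-precision product in twice that (dfelm_t), with elements padded up to 448 bits. A tiny self-check of that arithmetic, using hypothetical local names rather than anything from the deleted code:

#include <assert.h>

int main(void)
{
    /* Hypothetical local mirror of the deleted header's constants. */
    const unsigned nbits_field = 434;                        /* NBITS_FIELD */
    const unsigned radix = 64;                               /* digit width on TARGET_AMD64 */
    const unsigned nwords_field = (nbits_field + radix - 1) / radix;

    assert(nwords_field == 7);           /* NWORDS_FIELD / NWORDS64_FIELD: one felm_t = 7 digits */
    assert(2 * nwords_field == 14);      /* dfelm_t holds a 2x434-bit double-precision value */
    assert(nwords_field * radix == 448); /* matches MAXBITS_FIELD: elements padded to 448 bits */
    return 0;
}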
-typedef struct felm_s { - felm_t e[2]; -} f2elm_t; // Datatype for representing quadratic extension field elements GF(p434^2) - -typedef struct { - f2elm_t X; - f2elm_t Z; -} point_proj; // Point representation in projective XZ Montgomery coordinates. -typedef point_proj point_proj_t[1]; - -/**************** Function prototypes ****************/ -/************* Multiprecision functions **************/ - -// Copy wordsize digits, c = a, where lng(a) = nwords -void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords); - -// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit -unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); - -// 434-bit multiprecision addition, c = a+b -void mp_add434_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit -unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); - -// 2x434-bit multiprecision subtraction followed by addition with p434*2^448, c = a-b+(p434*2^448) if a-b < 0, otherwise c=a-b -void mp_subaddx2_asm(const digit_t *a, const digit_t *b, digit_t *c); -void mp_subadd434x2_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Double 2x434-bit multiprecision subtraction, c = c-a-b, where c > a and c > b -void mp_dblsub434x2_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Multiprecision right shift by one -void mp_shiftr1(digit_t *x, const unsigned int nwords); - -// Digit multiplication, digit * digit -> 2-digit result -void digit_x_digit(const digit_t a, const digit_t b, digit_t *c); - -// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. -void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); - -/************ Field arithmetic functions *************/ - -// Copy of a field element, c = a -void fpcopy434(const digit_t *a, digit_t *c); - -// Zeroing a field element, a = 0 -void fpzero434(digit_t *a); - -// Modular addition, c = a+b mod p434 -extern void fpadd434(const digit_t *a, const digit_t *b, digit_t *c); -extern void fpadd434_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Modular subtraction, c = a-b mod p434 -extern void fpsub434(const digit_t *a, const digit_t *b, digit_t *c); -extern void fpsub434_asm(const digit_t *a, const digit_t *b, digit_t *c); - -// Modular negation, a = -a mod p434 -extern void fpneg434(digit_t *a); - -// Modular division by two, c = a/2 mod p434. -void fpdiv2_434(const digit_t *a, digit_t *c); - -// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. 
-void fpcorrection434(digit_t *a); - -// 434-bit Montgomery reduction, c = a mod p -void rdc_mont(const digit_t *a, digit_t *c); - -// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768 -void fpmul434_mont(const digit_t *a, const digit_t *b, digit_t *c); -void mul434_asm(const digit_t *a, const digit_t *b, digit_t *c); -void rdc434_asm(const digit_t *ma, digit_t *mc); - -// Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768 -void fpsqr434_mont(const digit_t *ma, digit_t *mc); - -// Conversion to Montgomery representation -void to_mont(const digit_t *a, digit_t *mc); - -// Conversion from Montgomery representation to standard representation -void from_mont(const digit_t *ma, digit_t *c); - -// Field inversion, a = a^-1 in GF(p434) -void fpinv434_mont(digit_t *a); - -// Chain to compute (p434-3)/4 using Montgomery arithmetic -void fpinv434_chain_mont(digit_t *a); - -/************ GF(p^2) arithmetic functions *************/ - -// Copy of a GF(p434^2) element, c = a -void fp2copy434(const f2elm_t *a, f2elm_t *c); - -// Zeroing a GF(p434^2) element, a = 0 -void fp2zero434(f2elm_t *a); - -// GF(p434^2) negation, a = -a in GF(p434^2) -void fp2neg434(f2elm_t *a); - -// GF(p434^2) addition, c = a+b in GF(p434^2) -void fp2add434(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); - -// GF(p434^2) subtraction, c = a-b in GF(p434^2) -extern void fp2sub434(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); - -// GF(p434^2) division by two, c = a/2 in GF(p434^2) -void fp2div2_434(const f2elm_t *a, f2elm_t *c); - -// Modular correction, a = a in GF(p434^2) -void fp2correction434(f2elm_t *a); - -// GF(p434^2) squaring using Montgomery arithmetic, c = a^2 in GF(p434^2) -void fp2sqr434_mont(const f2elm_t *a, f2elm_t *c); - -// GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2) -void fp2mul434_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); - -// Conversion of a GF(p434^2) element to Montgomery representation -void to_fp2mont(const f2elm_t *a, f2elm_t *mc); - -// Conversion of a GF(p434^2) element from Montgomery representation to standard representation -void from_fp2mont(const f2elm_t *ma, f2elm_t *c); - -// GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) -void fp2inv434_mont(f2elm_t *a); - -/************ Elliptic curve and isogeny functions *************/ - -// Computes the j-invariant of a Montgomery curve with projective constant. -void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv); - -// Simultaneous doubling and differential addition. -void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24); - -// Doubling of a Montgomery point in projective coordinates (X:Z). -void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24); - -// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. -void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e); - -// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. -void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff); - -// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. -void eval_4_isog(point_proj_t P, f2elm_t *coeff); - -// Tripling of a Montgomery point in projective coordinates (X:Z). 
-void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus); - -// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. -void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e); - -// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. -void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff); - -// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. -void eval_3_isog(point_proj_t Q, const f2elm_t *coeff); - -// 3-way simultaneous inversion -void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3); - -// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. -void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A); - -#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h deleted file mode 100644 index 6199e5a708..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/config.h +++ /dev/null @@ -1,218 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: configuration file and platform-dependent macros -*********************************************************************************************/ - -#ifndef SIKE_CONFIG_H -#define SIKE_CONFIG_H - -#include <stdint.h> -#include <stdbool.h> -#include <stddef.h> - -// Definition of operating system - -#define OS_WIN 1 -#define OS_LINUX 2 - -#if defined(_WIN32) // Microsoft Windows OS -#define OS_TARGET OS_WIN -#else -#define OS_TARGET OS_LINUX // default to Linux -#endif - -// Definition of compiler (removed in OQS) - -#define COMPILER_GCC 1 -#define COMPILER_CLANG 2 - -#if defined(__GNUC__) // GNU GCC compiler -#define COMPILER COMPILER_GCC -#elif defined(__clang__) // Clang compiler -#define COMPILER COMPILER_CLANG -#else -#error -- "Unsupported COMPILER" -#endif - -// Definition of the targeted architecture and basic data types -#define TARGET_AMD64 1 -#define TARGET_x86 2 -#define TARGET_ARM 3 -#define TARGET_ARM64 4 - -#if defined(__x86_64__) -#define TARGET TARGET_AMD64 -#define RADIX 64 -#define LOG2RADIX 6 -typedef uint64_t digit_t; // Unsigned 64-bit digit -typedef uint32_t hdigit_t; // Unsigned 32-bit digit -#elif defined(__i386__) -#define TARGET TARGET_x86 -#define RADIX 32 -#define LOG2RADIX 5 -typedef uint32_t digit_t; // Unsigned 32-bit digit -typedef uint16_t hdigit_t; // Unsigned 16-bit digit -#elif defined(__arm__) -#define TARGET TARGET_ARM -#define RADIX 32 -#define LOG2RADIX 5 -typedef uint32_t digit_t; // Unsigned 32-bit digit -typedef uint16_t hdigit_t; // Unsigned 16-bit digit -#elif defined(__aarch64__) -#define TARGET TARGET_ARM64 -#define RADIX 64 -#define LOG2RADIX 6 -typedef uint64_t digit_t; // Unsigned 64-bit digit -typedef uint32_t hdigit_t; // Unsigned 32-bit digit -#else -#error-- "Unsupported ARCHITECTURE" -#endif - -#define RADIX64 64 - -// Extended datatype support -#if !defined(S2N_SIKEP434R2_ASM) -typedef uint64_t uint128_t[2]; -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) -typedef unsigned uint128_t __attribute__((mode(TI))); -#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) 
-typedef unsigned uint128_t __attribute__((mode(TI))); -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) -typedef uint64_t uint128_t[2]; -#endif - -// Macro definitions - -#define NBITS_TO_NBYTES(nbits) (((nbits) + 7) / 8) // Conversion macro from number of bits to number of bytes -#define NBITS_TO_NWORDS(nbits) (((nbits) + (sizeof(digit_t) * 8) - 1) / (sizeof(digit_t) * 8)) // Conversion macro from number of bits to number of computer words -#define NBYTES_TO_NWORDS(nbytes) (((nbytes) + sizeof(digit_t) - 1) / sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words - -// Macro to avoid compiler warnings when detecting unreferenced parameters -#define UNREFERENCED_PARAMETER(PAR) ((void) (PAR)) - -/********************** Constant-time unsigned comparisons ***********************/ - -// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise - -unsigned int is_digit_nonzero_ct(digit_t x) { // Is x != 0? - return (unsigned int) ((x | (0 - x)) >> (RADIX - 1)); -} - -unsigned int is_digit_zero_ct(digit_t x) { // Is x = 0? - return (unsigned int) (1 ^ is_digit_nonzero_ct(x)); -} - -unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) { // Is x < y? - return (unsigned int) ((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX - 1)); -} - -/********************** Macros for platform-dependent operations **********************/ - -#if (!defined(S2N_SIKEP434R2_ASM)) || (TARGET == TARGET_ARM) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - digit_x_digit((multiplier), (multiplicand), &(lo)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { \ - digit_t tempReg = (addend1) + (digit_t)(carryIn); \ - (sumOut) = (addend2) + tempReg; \ - (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); \ - } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { \ - digit_t tempReg = (minuend) - (subtrahend); \ - unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) &is_digit_zero_ct(tempReg))); \ - (differenceOut) = tempReg - (digit_t)(borrowIn); \ - (borrowOut) = borrowReg; \ - } - -// Shift right with flexible datatype -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); - -// Shift left with flexible datatype -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift))); - -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - (lo) = _umul128((multiplier), (multiplicand), (hi)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - (carryOut) = _addcarry_u64((carryIn), (addend1), (addend2), &(sumOut)); - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - (borrowOut) = _subborrow_u64((borrowIn), (minuend), (subtrahend), &(differenceOut)); - -// Digit shift right -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = __shiftright128((lowIn), (highIn), (shift)); - -// Digit shift left -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = __shiftleft128((lowIn), (highIn), (shift)); - -// 64x64-bit multiplication -#define MUL128(multiplier, multiplicand, product) \ - (product)[0] = 
_umul128((multiplier), (multiplicand), &(product)[1]); - -// 128-bit addition with output carry -#define ADC128(addend1, addend2, carry, addition) \ - (carry) = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ - (carry) = _addcarry_u64((carry), (addend1)[1], (addend2)[1], &(addition)[1]); - -#define MULADD128(multiplier, multiplicand, addend, carry, result) \ - ; \ - { \ - uint128_t product; \ - MUL128(multiplier, multiplicand, product); \ - ADC128(addend, product, carry, result); \ - } - -#elif ((TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && OS_TARGET == OS_LINUX) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - { \ - uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ - *(hi) = (digit_t)(tempReg >> RADIX); \ - (lo) = (digit_t) tempReg; \ - } - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { \ - uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ - (carryOut) = (digit_t)(tempReg >> RADIX); \ - (sumOut) = (digit_t) tempReg; \ - } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { \ - uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ - (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t) * 8 - 1)); \ - (differenceOut) = (digit_t) tempReg; \ - } - -// Digit shift right -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); - -// Digit shift left -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); - -#endif - -#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c deleted file mode 100644 index 8a3f85e92b..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/ec_isogeny.c +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: elliptic curve and isogeny functions -*********************************************************************************************/ - -void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24) { // Doubling of a Montgomery point in projective coordinates (X:Z). - // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. - // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). 
- f2elm_t _t0, _t1; - f2elm_t *t0=&_t0, *t1=&_t1; - - fp2sub(&P->X, &P->Z, t0); // t0 = X1-Z1 - fp2add(&P->X, &P->Z, t1); // t1 = X1+Z1 - fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 - fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 - fp2mul_mont(C24, t0, &Q->Z); // Z2 = C24*(X1-Z1)^2 - fp2mul_mont(t1, &Q->Z, &Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 - fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2 - fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] - fp2add(&Q->Z, t0, &Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 - fp2mul_mont(&Q->Z, t1, &Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] -} - -void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e) { // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. - // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. - // Output: projective Montgomery x-coordinates Q <- (2^e)*P. - int i; - - copy_words((const digit_t *) P, (digit_t *) Q, 2 * 2 * NWORDS_FIELD); - - for (i = 0; i < e; i++) { - xDBL(Q, Q, A24plus, C24); - } -} - -void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff) { // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. - // Input: projective point of order four P = (X4:Z4). - // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients - // that are used to evaluate the isogeny at a point in eval_4_isog(). - - fp2sub(&P->X, &P->Z, &coeff[1]); // coeff[1] = X4-Z4 - fp2add(&P->X, &P->Z, &coeff[2]); // coeff[2] = X4+Z4 - fp2sqr_mont(&P->Z, &coeff[0]); // coeff[0] = Z4^2 - fp2add(&coeff[0], &coeff[0], &coeff[0]); // coeff[0] = 2*Z4^2 - fp2sqr_mont(&coeff[0], C24); // C24 = 4*Z4^4 - fp2add(&coeff[0], &coeff[0], &coeff[0]); // coeff[0] = 4*Z4^2 - fp2sqr_mont(&P->X, A24plus); // A24plus = X4^2 - fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 - fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 -} - -void eval_4_isog(point_proj_t P, f2elm_t *coeff) { // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined - // by the 3 coefficients in coeff (computed in the function get_4_isog()). - // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). - // Output: the projective point P = phi(P) = (X:Z) in the codomain. 
- f2elm_t _t0, _t1; - f2elm_t *t0=&_t0, *t1=&_t1; - - fp2add(&P->X, &P->Z, t0); // t0 = X+Z - fp2sub(&P->X, &P->Z, t1); // t1 = X-Z - fp2mul_mont(t0, &coeff[1], &P->X); // X = (X+Z)*coeff[1] - fp2mul_mont(t1, &coeff[2], &P->Z); // Z = (X-Z)*coeff[2] - fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) - fp2mul_mont(t0, &coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) - fp2add(&P->X, &P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] - fp2sub(&P->X, &P->Z, &P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] - fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 - fp2sqr_mont(&P->Z, &P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - fp2add(t1, t0, &P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 - fp2sub(&P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) - fp2mul_mont(&P->X, t1, &P->X); // Xfinal - fp2mul_mont(&P->Z, t0, &P->Z); // Zfinal -} - -void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus) { // Tripling of a Montgomery point in projective coordinates (X:Z). - // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. - // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). - f2elm_t _t0, _t1, _t2, _t3, _t4, _t5, _t6; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4, *t5=&_t5, *t6=&_t6; - - fp2sub(&P->X, &P->Z, t0); // t0 = X-Z - fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 - fp2add(&P->X, &P->Z, t1); // t1 = X+Z - fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 - fp2add(t0, t1, t4); // t4 = 2*X - fp2sub(t1, t0, t0); // t0 = 2*Z - fp2sqr_mont(t4, t1); // t1 = 4*X^2 - fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 - fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 - fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 - fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3 - fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 - fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3 - fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 - fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 - fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] - fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 - fp2sqr_mont(t2, t2); // t2 = t2^2 - fp2mul_mont(t4, t2, &Q->X); // X3 = 2*X*t2 - fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] - fp2sqr_mont(t1, t1); // t1 = t1^2 - fp2mul_mont(t0, t1, &Q->Z); // Z3 = 2*Z*t1 -} - -void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e) { // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. - // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. - // Output: projective Montgomery x-coordinates Q <- (3^e)*P. - int i; - - copy_words((const digit_t *) P, (digit_t *) Q, 2 * 2 * NWORDS_FIELD); - - for (i = 0; i < e; i++) { - xTPL(Q, Q, A24minus, A24plus); - } -} - -void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff) { // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. - // Input: projective point of order three P = (X3:Z3). - // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. 
- f2elm_t _t0, _t1, _t2, _t3, _t4; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4; - - fp2sub(&P->X, &P->Z, &coeff[0]); // coeff0 = X-Z - fp2sqr_mont(&coeff[0], t0); // t0 = (X-Z)^2 - fp2add(&P->X, &P->Z, &coeff[1]); // coeff1 = X+Z - fp2sqr_mont(&coeff[1], t1); // t1 = (X+Z)^2 - fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 - fp2add(&coeff[0], &coeff[1], t3); // t3 = 2*X - fp2sqr_mont(t3, t3); // t3 = 4*X^2 - fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 - fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 - fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 - fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 - fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) - fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 - fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] - fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 - fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) - fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 - fp2mul_mont(t3, t4, A24plus); // A24plus = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] -} - -void eval_3_isog(point_proj_t Q, const f2elm_t *coeff) { // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and - // a point P with 2 coefficients in coeff (computed in the function get_3_isog()). - // Inputs: projective points P = (X3:Z3) and Q = (X:Z). - // Output: the projective point Q <- phi(Q) = (X3:Z3). - f2elm_t _t0, _t1, _t2; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; - - fp2add(&Q->X, &Q->Z, t0); // t0 = X+Z - fp2sub(&Q->X, &Q->Z, t1); // t1 = X-Z - fp2mul_mont(t0, &coeff[0], t0); // t0 = coeff0*(X+Z) - fp2mul_mont(t1, &coeff[1], t1); // t1 = coeff1*(X-Z) - fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z) - fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z) - fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 - fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 - fp2mul_mont(&Q->X, t2, &Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 - fp2mul_mont(&Q->Z, t0, &Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 -} - -void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3) { // 3-way simultaneous inversion - // Input: z1,z2,z3 - // Output: 1/z1,1/z2,1/z3 (override inputs). - f2elm_t _t0, _t1, _t2, _t3; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3; - - fp2mul_mont(z1, z2, t0); // t0 = z1*z2 - fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 - fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) - fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) - fp2mul_mont(t2, z2, t3); // t3 = 1/z1 - fp2mul_mont(t2, z1, z2); // z2 = 1/z2 - fp2mul_mont(t0, t1, z3); // z3 = 1/z3 - fp2copy(t3, z1); // z1 = 1/z1 -} - -void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A) { // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. - // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. - // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
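inv_3_way in the removed ec_isogeny.c above is Montgomery's simultaneous-inversion trick: invert the single product z1*z2*z3 once, then recover 1/z1, 1/z2 and 1/z3 with a handful of multiplications; the key-generation code further down uses it to normalize three projective points with one field inversion. Below is a minimal sketch of the same bookkeeping over a small illustrative prime, with the one inversion done by Fermat exponentiation; the prime, helper names and test values are not the library's.

#include <stdint.h>
#include <assert.h>

#define Q 1000003ull                          /* small toy prime, not p434 */

static uint64_t powmod(uint64_t b, uint64_t e) {
    uint64_t r = 1;
    b %= Q;
    while (e) {
        if (e & 1) r = r * b % Q;
        b = b * b % Q;
        e >>= 1;
    }
    return r;
}

/* Invert z1, z2, z3 in place using a single inversion, as inv_3_way does. */
static void inv_3_way_toy(uint64_t *z1, uint64_t *z2, uint64_t *z3) {
    uint64_t t0 = *z1 * *z2 % Q;              /* t0 = z1*z2          */
    uint64_t t1 = t0 * *z3 % Q;               /* t1 = z1*z2*z3       */
    t1 = powmod(t1, Q - 2);                   /* t1 = 1/(z1*z2*z3)   */
    uint64_t t2 = *z3 * t1 % Q;               /* t2 = 1/(z1*z2)      */
    uint64_t t3 = t2 * *z2 % Q;               /* t3 = 1/z1           */
    *z2 = t2 * *z1 % Q;                       /* z2 = 1/z2           */
    *z3 = t0 * t1 % Q;                        /* z3 = 1/z3           */
    *z1 = t3;                                 /* z1 = 1/z1           */
}

int main(void) {
    uint64_t a = 1234, b = 56789, c = 999999;
    uint64_t ia = a, ib = b, ic = c;
    inv_3_way_toy(&ia, &ib, &ic);
    assert(a * ia % Q == 1 && b * ib % Q == 1 && c * ic % Q == 1);
    return 0;
}

The temporaries mirror the removed function: only one expensive inversion is paid no matter how many values are normalized.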
- f2elm_t _t0, _t1, one = {0}; - f2elm_t *t0=&_t0, *t1=&_t1; - - fpcopy((const digit_t *) &Montgomery_one, one.e[0]); - fp2add(xP, xQ, t1); // t1 = xP+xQ - fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ - fp2mul_mont(xR, t1, A); // A = xR*t1 - fp2add(t0, A, A); // A = A+t0 - fp2mul_mont(t0, xR, t0); // t0 = t0*xR - fp2sub(A, &one, A); // A = A-1 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2add(t1, xR, t1); // t1 = t1+xR - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2sqr_mont(A, A); // A = A^2 - fp2inv_mont(t0); // t0 = 1/t0 - fp2mul_mont(A, t0, A); // A = A*t0 - fp2sub(A, t1, A); // Afinal = A-t1 -} - -void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv) { // Computes the j-invariant of a Montgomery curve with projective constant. - // Input: A,C in GF(p^2). - // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. - f2elm_t _t0, _t1; - f2elm_t *t0=&_t0, *t1=&_t1; - - fp2sqr_mont(A, jinv); // jinv = A^2 - fp2sqr_mont(C, t1); // t1 = C^2 - fp2add(t1, t1, t0); // t0 = t1+t1 - fp2sub(jinv, t0, t0); // t0 = jinv-t0 - fp2sub(t0, t1, t0); // t0 = t0-t1 - fp2sub(t0, t1, jinv); // jinv = t0-t1 - fp2sqr_mont(t1, t1); // t1 = t1^2 - fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2sqr_mont(t0, t1); // t1 = t0^2 - fp2mul_mont(t0, t1, t0); // t0 = t0*t1 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2add(t0, t0, t0); // t0 = t0+t0 - fp2inv_mont(jinv); // jinv = 1/jinv - fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv -} - -void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24) { // Simultaneous doubling and differential addition. - // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. - // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. - f2elm_t _t0, _t1, _t2; - f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; - - fp2add(&P->X, &P->Z, t0); // t0 = XP+ZP - fp2sub(&P->X, &P->Z, t1); // t1 = XP-ZP - fp2sqr_mont(t0, &P->X); // XP = (XP+ZP)^2 - fp2sub(&Q->X, &Q->Z, t2); // t2 = XQ-ZQ - fp2correction(t2); - fp2add(&Q->X, &Q->Z, &Q->X); // XQ = XQ+ZQ - fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ) - fp2sqr_mont(t1, &P->Z); // ZP = (XP-ZP)^2 - fp2mul_mont(t1, &Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ) - fp2sub(&P->X, &P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 - fp2mul_mont(&P->X, &P->Z, &P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 - fp2mul_mont(t2, A24, &Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] - fp2sub(t0, t1, &Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) - fp2add(&Q->X, &P->Z, &P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 - fp2add(t0, t1, &Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) - fp2mul_mont(&P->Z, t2, &P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] - fp2sqr_mont(&Q->Z, &Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 - fp2sqr_mont(&Q->X, &Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 - fp2mul_mont(&Q->Z, xPQ, &Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 -} - -static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) { // Swap points. 
- // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P - for (unsigned int i = 0; i < NWORDS_FIELD; i++) { - digit_t temp = option & (P->X.e[0][i] ^ Q->X.e[0][i]); - P->X.e[0][i] = temp ^ P->X.e[0][i]; - Q->X.e[0][i] = temp ^ Q->X.e[0][i]; - temp = option & (P->Z.e[0][i] ^ Q->Z.e[0][i]); - P->Z.e[0][i] = temp ^ P->Z.e[0][i]; - Q->Z.e[0][i] = temp ^ Q->Z.e[0][i]; - temp = option & (P->X.e[1][i] ^ Q->X.e[1][i]); - P->X.e[1][i] = temp ^ P->X.e[1][i]; - Q->X.e[1][i] = temp ^ Q->X.e[1][i]; - temp = option & (P->Z.e[1][i] ^ Q->Z.e[1][i]); - P->Z.e[1][i] = temp ^ P->Z.e[1][i]; - Q->Z.e[1][i] = temp ^ Q->Z.e[1][i]; - } -} - -void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t *m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A) { - point_proj_t R0 = {0}, R2 = {0}; - f2elm_t _A24 = {0}; - f2elm_t *A24=&_A24; - digit_t mask; - int i, nbits, swap, prevbit = 0; - - if (AliceOrBob == ALICE) { - nbits = OALICE_BITS; - } else { - nbits = OBOB_BITS - 1; - } - - // Initializing constant - fpcopy((const digit_t *) &Montgomery_one, A24->e[0]); - fp2add(A24, A24, A24); - fp2add(A, A24, A24); - fp2div2(A24, A24); - fp2div2(A24, A24); // A24 = (A+2)/4 - - // Initializing points - fp2copy(xQ, &R0->X); - fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R0->Z.e); - fp2copy(xPQ, &R2->X); - fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R2->Z.e); - fp2copy(xP, &R->X); - fpcopy((const digit_t *) &Montgomery_one, (digit_t *) R->Z.e); - fpzero((digit_t *) (R->Z.e)[1]); - - // Main loop - for (i = 0; i < nbits; i++) { - int bit = (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1; - swap = bit ^ prevbit; - prevbit = bit; - mask = 0 - (digit_t) swap; - - swap_points(R, R2, mask); - xDBLADD(R0, R2, &R->X, A24); - fp2mul_mont(&R2->X, &R->Z, &R2->X); - } - swap = 0 ^ prevbit; - mask = 0 - (digit_t) swap; - swap_points(R, R2, mask); -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h deleted file mode 100644 index 1196bff2c0..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fips202.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef FIPS202_H -#define FIPS202_H - -#define SHAKE256_RATE 136 - -/** Data structure for the state of the SHAKE-256 non-incremental hashing API. */ -typedef struct { -/** Internal state. */ - uint64_t ctx[25]; -} shake256_ctx; - -void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen); - -#endif // FIPS202_H diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c deleted file mode 100644 index 0e09ce25a0..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fp.c +++ /dev/null @@ -1,241 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: Portable C and x86_64 ASM functions for modular arithmetic for P434 -*********************************************************************************************/ - -#include "P434_internal.h" - -// Modular addition, c = a+b mod p434. 
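swap_points and the LADDER3PT loop in the removed ec_isogeny.c above select between two points without branching on the secret scalar: the current bit is stretched into an all-zero or all-one mask (mask = 0 - swap) and applied with XOR, so the instruction sequence and memory accesses are identical whichever value the bit has. A minimal, self-contained sketch of that idiom follows; the word type and test values are illustrative only.

#include <stdint.h>
#include <assert.h>

typedef uint64_t digit_t;                      /* stand-in for the library's digit type */

/* Swap *a and *b iff mask is all ones; mask must be 0 or ~0. */
static void cswap(digit_t *a, digit_t *b, digit_t mask) {
    digit_t t = mask & (*a ^ *b);              /* t is 0 when mask is 0 */
    *a ^= t;
    *b ^= t;
}

int main(void) {
    digit_t x = 0x1111, y = 0x2222;

    cswap(&x, &y, 0);                          /* bit = 0: nothing moves */
    assert(x == 0x1111 && y == 0x2222);

    digit_t bit = 1;                           /* secret bit             */
    cswap(&x, &y, (digit_t)0 - bit);           /* bit = 1: values swap   */
    assert(x == 0x2222 && y == 0x1111);
    return 0;
}

The removed routine applies the same three XORs to every word of both GF(p^2) coordinates of X and Z.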
-// Inputs: a, b in [0, 2*p434-1] -// Output: c in [0, 2*p434-1] -void fpadd434(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - fpadd434_asm(a, b, c); - return; - } -#endif - - unsigned int i, carry = 0; - digit_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(carry, c[i], ((const digit_t *) p434x2)[i], carry, c[i]); - } - mask = 0 - (digit_t) carry; - - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, c[i], ((const digit_t *) p434x2)[i] & mask, carry, c[i]); - } -} - -// Modular subtraction, c = a-b mod p434. -// Inputs: a, b in [0, 2*p434-1] -// Output: c in [0, 2*p434-1] -void fpsub434(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - fpsub434_asm(a, b, c); - return; - } -#endif - - unsigned int i, borrow = 0; - digit_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - mask = 0 - (digit_t) borrow; - - borrow = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(borrow, c[i], ((const digit_t *) p434x2)[i] & mask, borrow, c[i]); - } -} - -// Modular negation, a = -a mod p434. -// Input/output: a in [0, 2*p434-1] -void fpneg434(digit_t *a) { - unsigned int i, borrow = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, ((const digit_t *) p434x2)[i], a[i], borrow, a[i]); - } -} - -// Modular division by two, c = a/2 mod p434. -// Input : a in [0, 2*p434-1] -// Output: c in [0, 2*p434-1] -void fpdiv2_434(const digit_t *a, digit_t *c) { - unsigned int i, carry = 0; - digit_t mask; - - mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p434 - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], ((const digit_t *) p434)[i] & mask, carry, c[i]); - } - - mp_shiftr1(c, NWORDS_FIELD); -} - -// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. -void fpcorrection434(digit_t *a) { - unsigned int i, borrow = 0; - digit_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], ((const digit_t *) p434)[i], borrow, a[i]); - } - mask = 0 - (digit_t) borrow; - - borrow = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(borrow, a[i], ((const digit_t *) p434)[i] & mask, borrow, a[i]); - } -} - -// Digit multiplication, digit * digit -> 2-digit result -void digit_x_digit(const digit_t a, const digit_t b, digit_t *c) { - register digit_t al, ah, bl, bh, temp; - digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; - digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4); - - al = a & mask_low; // Low part - ah = a >> (sizeof(digit_t) * 4); // High part - bl = b & mask_low; - bh = b >> (sizeof(digit_t) * 4); - - albl = al * bl; - albh = al * bh; - ahbl = ah * bl; - ahbh = ah * bh; - c[0] = albl & mask_low; // C00 - - res1 = albl >> (sizeof(digit_t) * 4); - res2 = ahbl & mask_low; - res3 = albh & mask_low; - temp = res1 + res2 + res3; - carry = temp >> (sizeof(digit_t) * 4); - c[0] ^= temp << (sizeof(digit_t) * 4); // C01 - - res1 = ahbl >> (sizeof(digit_t) * 4); - res2 = albh >> (sizeof(digit_t) * 4); - res3 = ahbh & mask_low; - temp = res1 + res2 + res3 + carry; - c[1] = temp & mask_low; // C10 - carry = temp & mask_high; - c[1] ^= (ahbh & mask_high) + carry; // C11 -} - -// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
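The field routines above (fpadd434, fpsub434 and fpcorrection434) avoid secret-dependent branches in the same way: do the raw add or subtract, turn the final carry or borrow bit into a mask with 0 - (digit_t) borrow, and add the masked modulus back in one unconditional pass. A one-word sketch of that correction, assuming an illustrative 32-bit modulus rather than the multi-word p434:

#include <stdint.h>
#include <assert.h>

#define TOY_P 0xFFFFFFFBu                      /* toy modulus, not p434 */

/* c = a + b mod TOY_P for a, b in [0, TOY_P-1], with no data-dependent branch. */
static uint32_t modadd_ct(uint32_t a, uint32_t b) {
    uint64_t t = (uint64_t)a + b;              /* raw sum, at most 2*TOY_P - 2      */
    uint64_t d = t - TOY_P;                    /* provisional subtraction           */
    uint32_t borrow = (uint32_t)(d >> 63);     /* 1 iff the subtraction underflowed */
    uint32_t mask = 0 - borrow;                /* all ones iff TOY_P must go back   */
    return (uint32_t)(d + (TOY_P & mask));
}

int main(void) {
    assert(modadd_ct(5, 7) == 12);
    assert(modadd_ct(TOY_P - 1, 1) == 0);
    assert(modadd_ct(TOY_P - 1, TOY_P - 1) == TOY_P - 2);
    return 0;
}

The removed code does exactly this across NWORDS_FIELD words, chaining the carries with the ADDC/SUBC macros.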
-void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - UNREFERENCED_PARAMETER(nwords); - mul434_asm(a, b, c); - return; - } -#endif - - unsigned int i, j, carry; - digit_t t = 0, u = 0, v = 0, UV[2]; - - for (i = 0; i < nwords; i++) { - for (j = 0; j <= i; j++) { - MUL(a[j], b[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - c[i] = v; - v = u; - u = t; - t = 0; - } - - for (i = nwords; i < 2 * nwords - 1; i++) { - for (j = i - nwords + 1; j < nwords; j++) { - MUL(a[j], b[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - c[i] = v; - v = u; - u = t; - t = 0; - } - c[2 * nwords - 1] = v; -} - -// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. -// mc = ma*R^-1 mod p434x2, where R = 2^448. -// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. -// ma is assumed to be in Montgomery representation. -void rdc_mont(const digit_t *ma, digit_t *mc) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - rdc434_asm(ma, mc); - return; - } -#endif - - unsigned int i, j, carry, count = p434_ZERO_WORDS; - digit_t UV[2], t = 0, u = 0, v = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - mc[i] = 0; - } - - for (i = 0; i < NWORDS_FIELD; i++) { - for (j = 0; j < i; j++) { - if (j < (i - p434_ZERO_WORDS + 1)) { - MUL(mc[j], ((const digit_t *) p434p1)[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - } - ADDC(0, v, ma[i], carry, v); - ADDC(carry, u, 0, carry, u); - t += carry; - mc[i] = v; - v = u; - u = t; - t = 0; - } - - for (i = NWORDS_FIELD; i < 2 * NWORDS_FIELD - 1; i++) { - if (count > 0) { - count -= 1; - } - for (j = i - NWORDS_FIELD + 1; j < NWORDS_FIELD; j++) { - if (j < (NWORDS_FIELD - count)) { - MUL(mc[j], ((const digit_t *) p434p1)[i - j], UV + 1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - } - ADDC(0, v, ma[i], carry, v); - ADDC(carry, u, 0, carry, u); - t += carry; - mc[i - NWORDS_FIELD] = v; - v = u; - u = t; - t = 0; - } - - /* `carry` isn't read after this, but it's still a necessary argument to the macro */ - /* cppcheck-suppress unreadVariable */ - ADDC(0, v, ma[2 * NWORDS_FIELD - 1], carry, v); - mc[NWORDS_FIELD - 1] = v; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c deleted file mode 100644 index e5b356b93b..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/fpx.c +++ /dev/null @@ -1,387 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: core functions over GF(p) and GF(p^2) -*********************************************************************************************/ - -// Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes -void fp2_encode(const f2elm_t *x, unsigned char *enc) { - unsigned int i; - f2elm_t t; - - from_fp2mont(x, &t); - for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { - enc[i] = ((unsigned char *) t.e)[i]; - enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *) t.e)[i + MAXBITS_FIELD / 8]; - } -} - -// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery 
representation -void fp2_decode(const unsigned char *enc, f2elm_t *x) { - unsigned int i; - - for (i = 0; i < 2 * (MAXBITS_FIELD / 8); i++) - ((unsigned char *) x->e)[i] = 0; - for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { - ((unsigned char *) x->e)[i] = enc[i]; - ((unsigned char *) x->e)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; - } - to_fp2mont(x, x); -} - -// Copy a field element, c = a. -__inline void fpcopy(const felm_t a, felm_t c) { - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - c[i] = a[i]; -} - -// Zero a field element, a = 0. -__inline void fpzero(felm_t a) { - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - a[i] = 0; -} - -// Conversion to Montgomery representation, -// mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. -// The Montgomery constant R^2 mod p is the global value "Montgomery_R2". -void to_mont(const felm_t a, felm_t mc) { - fpmul_mont(a, (const digit_t *) &Montgomery_R2, mc); -} - -// Conversion from Montgomery representation to standard representation, -// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. -void from_mont(const felm_t ma, felm_t c) { - digit_t one[NWORDS_FIELD] = {0}; - - one[0] = 1; - fpmul_mont(ma, one, c); - fpcorrection(c); -} - -// Copy wordsize digits, c = a, where lng(a) = nwords. -void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords) { - unsigned int i; - - for (i = 0; i < nwords; i++) - c[i] = a[i]; -} - -// Multiprecision multiplication, c = a*b mod p. -void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) { - dfelm_t temp = {0}; - - mp_mul(ma, mb, temp, NWORDS_FIELD); - rdc_mont(temp, mc); -} - -// Multiprecision squaring, c = a^2 mod p. -void fpsqr_mont(const felm_t ma, felm_t mc) { - dfelm_t temp = {0}; - - mp_mul(ma, ma, temp, NWORDS_FIELD); - rdc_mont(temp, mc); -} - -// Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. -void fpinv_mont(felm_t a) { - felm_t tt; - - fpcopy(a, tt); - fpinv_chain_mont(tt); - fpsqr_mont(tt, tt); - fpsqr_mont(tt, tt); - fpmul_mont(a, tt, a); -} - -// Copy a GF(p^2) element, c = a. -void fp2copy(const f2elm_t *a, f2elm_t *c) { - fpcopy(a->e[0], c->e[0]); - fpcopy(a->e[1], c->e[1]); -} - -// Zero a GF(p^2) element, a = 0. -void fp2zero(f2elm_t *a) { - fpzero(a->e[0]); - fpzero(a->e[1]); -} - -// GF(p^2) negation, a = -a in GF(p^2). -void fp2neg(f2elm_t *a) { - fpneg(a->e[0]); - fpneg(a->e[1]); -} - -// GF(p^2) addition, c = a+b in GF(p^2). -__inline void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) { - fpadd(a->e[0], b->e[0], c->e[0]); - fpadd(a->e[1], b->e[1], c->e[1]); -} - -// GF(p^2) subtraction, c = a-b in GF(p^2). -__inline void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) { - fpsub(a->e[0], b->e[0], c->e[0]); - fpsub(a->e[1], b->e[1], c->e[1]); -} - -// GF(p^2) division by two, c = a/2 in GF(p^2). -void fp2div2(const f2elm_t *a, f2elm_t *c) { - fpdiv2(a->e[0], c->e[0]); - fpdiv2(a->e[1], c->e[1]); -} - -// Modular correction, a = a in GF(p^2). -void fp2correction(f2elm_t *a) { - fpcorrection(a->e[0]); - fpcorrection(a->e[1]); -} - -// Multiprecision addition, c = a+b. -__inline static void mp_addfast(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - mp_add_asm(a, b, c); - return; - } -#endif - - mp_add(a, b, c, NWORDS_FIELD); -} - -// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
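fpmul_mont, to_mont and from_mont above keep every field element in Montgomery representation (a*R mod p with R = 2^448) so that rdc_mont can reduce a double-width product with word multiplications and shifts only, exploiting the special shape of p434+1. The single-word sketch below shows the underlying REDC step with an illustrative 32-bit modulus and R = 2^32; the names, the Newton-iteration inverse and the 128-bit intermediate are toy choices, not the library's code.

#include <stdint.h>
#include <assert.h>

#define TOY_P 0xFFFFFFFBu                      /* toy odd modulus (2^32 - 5); R = 2^32 */

/* -TOY_P^(-1) mod 2^32 by Newton iteration; each step doubles the correct bits. */
static uint32_t neg_pinv(void) {
    uint32_t inv = 1;
    for (int i = 0; i < 5; i++)
        inv *= 2u - TOY_P * inv;
    return 0u - inv;
}

/* REDC: for T < TOY_P * 2^32, return T * 2^(-32) mod TOY_P without dividing by TOY_P. */
static uint32_t redc(uint64_t T) {
    uint32_t m = (uint32_t)T * neg_pinv();     /* m = T * (-P^-1) mod R */
    uint64_t t = (uint64_t)((((unsigned __int128)m * TOY_P) + T) >> 32);
    return (uint32_t)(t >= TOY_P ? t - TOY_P : t);   /* one final correction */
}

int main(void) {
    uint32_t a = 123456789u, b = 987654321u;
    /* R^2 mod P; 2^32 mod P equals 2^32 - P because TOY_P > 2^31. */
    uint32_t R2 = (uint32_t)(((uint64_t)(0u - TOY_P) * (0u - TOY_P)) % TOY_P);

    uint32_t aR = redc((uint64_t)a * R2);      /* to Montgomery form, like to_mont      */
    uint32_t bR = redc((uint64_t)b * R2);
    uint32_t cR = redc((uint64_t)aR * bR);     /* Montgomery product, like fpmul_mont   */
    uint32_t c  = redc(cR);                    /* back to standard form, like from_mont */

    assert(c == (uint32_t)(((uint64_t)a * b) % TOY_P));
    return 0;
}

In the removed code the interleaved reduction additionally skips the p434_ZERO_WORDS low words of p434+1, which are all zero.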
-// Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] -// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -void fp2sqr_mont(const f2elm_t *a, f2elm_t *c) { - felm_t t1, t2, t3; - - mp_addfast(a->e[0], a->e[1], t1); // t1 = a0+a1 - fpsub(a->e[0], a->e[1], t2); // t2 = a0-a1 - mp_addfast(a->e[0], a->e[0], t3); // t3 = 2a0 - fpmul_mont(t1, t2, c->e[0]); // c0 = (a0+a1)(a0-a1) - fpmul_mont(t3, a->e[1], c->e[1]); // c1 = 2a0*a1 -} - -// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. -unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { - unsigned int i, borrow = 0; - - for (i = 0; i < nwords; i++) - SUBC(borrow, a[i], b[i], borrow, c[i]); - - return borrow; -} - -// Multiprecision subtraction followed by addition with p*2^MAXBITS_FIELD, c = a-b+(p*2^MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. -__inline static void mp_subaddfast(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - mp_subaddx2_asm(a, b, c); - return; - } -#endif - - felm_t t1; - - digit_t mask = 0 - (digit_t) mp_sub(a, b, c, 2 * NWORDS_FIELD); - for (int i = 0; i < NWORDS_FIELD; i++) - t1[i] = ((const digit_t *) PRIME)[i] & mask; - mp_addfast((digit_t *) &c[NWORDS_FIELD], t1, (digit_t *) &c[NWORDS_FIELD]); -} - -// Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -__inline static void mp_dblsubfast(const digit_t *a, const digit_t *b, digit_t *c) { -#if defined(S2N_SIKEP434R2_ASM) - if (s2n_sikep434r2_asm_is_enabled()) { - mp_dblsubx2_asm(a, b, c); - return; - } -#endif - - mp_sub(c, a, c, 2 * NWORDS_FIELD); - mp_sub(c, b, c, 2 * NWORDS_FIELD); -} - -// GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). -// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] -// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) { - felm_t t1, t2; - dfelm_t tt1, tt2, tt3; - - mp_addfast(a->e[0], a->e[1], t1); // t1 = a0+a1 - mp_addfast(b->e[0], b->e[1], t2); // t2 = b0+b1 - mp_mul(a->e[0], b->e[0], tt1, NWORDS_FIELD); // tt1 = a0*b0 - mp_mul(a->e[1], b->e[1], tt2, NWORDS_FIELD); // tt2 = a1*b1 - mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1) - mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - mp_subaddfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1 + p*2^MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1 - rdc_mont(tt3, c->e[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - rdc_mont(tt1, c->e[0]); // c[0] = a0*b0 - a1*b1 -} - -// Chain to compute a^(p-3)/4 using Montgomery arithmetic. 
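fp2mul_mont above multiplies in GF(p^2) = F_p(i), i^2 = -1, with three base-field multiplications instead of four: c0 = a0*b0 - a1*b1 and c1 = (a0+a1)(b0+b1) - a0*b0 - a1*b1. The toy below reproduces that identity over a small prime with plain % reduction; the prime, struct and test values are illustrative, not the P434 Montgomery arithmetic.

#include <stdint.h>
#include <assert.h>

#define Q 1000003ull                           /* small toy prime, not p434 */

typedef struct { uint64_t e[2]; } toy_fp2;     /* a = e[0] + e[1]*i, with i^2 = -1 */

/* Three-multiplication product, mirroring fp2mul_mont's structure. */
static toy_fp2 fp2mul_toy(toy_fp2 a, toy_fp2 b) {
    uint64_t t0 = a.e[0] * b.e[0] % Q;                          /* a0*b0          */
    uint64_t t1 = a.e[1] * b.e[1] % Q;                          /* a1*b1          */
    uint64_t t2 = (a.e[0] + a.e[1]) * (b.e[0] + b.e[1]) % Q;    /* (a0+a1)(b0+b1) */
    toy_fp2 c;
    c.e[0] = (t0 + Q - t1) % Q;                /* c0 = a0*b0 - a1*b1 */
    c.e[1] = (t2 + 2 * Q - t0 - t1) % Q;       /* c1 = a0*b1 + a1*b0 */
    return c;
}

int main(void) {
    toy_fp2 a = {{123456, 654321}}, b = {{77777, 99999}};
    toy_fp2 c = fp2mul_toy(a, b);

    /* Compare against the schoolbook four-multiplication result. */
    uint64_t c0 = (a.e[0] * b.e[0] % Q + Q - a.e[1] * b.e[1] % Q) % Q;
    uint64_t c1 = (a.e[0] * b.e[1] % Q + a.e[1] * b.e[0] % Q) % Q;
    assert(c.e[0] == c0 && c.e[1] == c1);
    return 0;
}

The removed routine performs the same three products with mp_mul and repairs the subtraction that may go negative with mp_subaddfast.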
-void fpinv_chain_mont(felm_t a) { - unsigned int i, j; - - felm_t t[31], tt; - - // Precomputed table - fpsqr_mont(a, tt); - fpmul_mont(a, tt, t[0]); - for (i = 0; i <= 29; i++) - fpmul_mont(t[i], tt, t[i + 1]); - - fpcopy(a, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[5], tt, tt); - for (i = 0; i < 10; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[14], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[3], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[23], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[13], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[24], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[7], tt, tt); - for (i = 0; i < 8; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[12], tt, tt); - for (i = 0; i < 8; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[1], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[21], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[2], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[19], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[1], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[24], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[26], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[16], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[10], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[6], tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[0], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[20], tt, tt); - for (i = 0; i < 8; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[9], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[25], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[26], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(a, tt, tt); - for (i = 0; i < 7; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[28], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[6], tt, tt); - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[10], tt, tt); - for (i = 0; i < 9; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[22], tt, tt); - for (j = 0; j < 35; j++) { - for (i = 0; i < 6; i++) - fpsqr_mont(tt, tt); - fpmul_mont(t[30], tt, tt); - } - fpcopy(tt, a); -} - -// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). -void fp2inv_mont(f2elm_t *a) { - f2elm_t t1; - - fpsqr_mont(a->e[0], t1.e[0]); // t10 = a0^2 - fpsqr_mont(a->e[1], t1.e[1]); // t11 = a1^2 - fpadd(t1.e[0], t1.e[1], t1.e[0]); // t10 = a0^2+a1^2 - fpinv_mont(t1.e[0]); // t10 = (a0^2+a1^2)^-1 - fpneg(a->e[1]); // a = a0-i*a1 - fpmul_mont(a->e[0], t1.e[0], a->e[0]); - fpmul_mont(a->e[1], t1.e[0], a->e[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 -} - -// Conversion of a GF(p^2) element to Montgomery representation, -// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). 
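fp2inv_mont above inverts a = a0 + a1*i by multiplying with the conjugate and inverting the norm: 1/a = (a0 - a1*i)/(a0^2 + a1^2), so a single base-field inversion suffices. Below is a toy version over a small prime q with q mod 4 == 3 (so i^2 = -1 stays a non-square and F_q(i) is a field); the base-field inverse is taken as a^(q-2), where the removed code uses the dedicated fpinv_chain_mont addition chain instead.

#include <stdint.h>
#include <assert.h>

#define Q 1000003ull                           /* toy prime with Q % 4 == 3 */

static uint64_t powmod(uint64_t b, uint64_t e) {
    uint64_t r = 1;
    b %= Q;
    while (e) {
        if (e & 1) r = r * b % Q;
        b = b * b % Q;
        e >>= 1;
    }
    return r;
}

typedef struct { uint64_t e[2]; } toy_fp2;     /* a = e[0] + e[1]*i */

/* 1/a = (a0 - a1*i) / (a0^2 + a1^2), mirroring fp2inv_mont. */
static toy_fp2 fp2inv_toy(toy_fp2 a) {
    uint64_t norm = (a.e[0] * a.e[0] % Q + a.e[1] * a.e[1] % Q) % Q;
    uint64_t ninv = powmod(norm, Q - 2);       /* one inversion in the base field */
    toy_fp2 r;
    r.e[0] = a.e[0] * ninv % Q;
    r.e[1] = (Q - a.e[1]) % Q * ninv % Q;      /* negate a1, then scale by 1/norm */
    return r;
}

int main(void) {
    toy_fp2 a = {{42, 98765}};
    toy_fp2 v = fp2inv_toy(a);

    /* a * (1/a) must equal 1 + 0*i (schoolbook product, i^2 = -1). */
    uint64_t c0 = (a.e[0] * v.e[0] % Q + Q - a.e[1] * v.e[1] % Q) % Q;
    uint64_t c1 = (a.e[0] * v.e[1] % Q + a.e[1] * v.e[0] % Q) % Q;
    assert(c0 == 1 && c1 == 0);
    return 0;
}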
-void to_fp2mont(const f2elm_t *a, f2elm_t *mc) { - to_mont(a->e[0], mc->e[0]); - to_mont(a->e[1], mc->e[1]); -} - -// Conversion of a GF(p^2) element from Montgomery representation to standard representation, -// c_i = ma_i*R^(-1) = a_i in GF(p^2). -void from_fp2mont(const f2elm_t *ma, f2elm_t *c) { - from_mont(ma->e[0], c->e[0]); - from_mont(ma->e[1], c->e[1]); -} - -// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. -unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { - unsigned int i, carry = 0; - - for (i = 0; i < nwords; i++) { - /* cppcheck-suppress shiftTooManyBits */ - /* cppcheck-suppress unmatchedSuppression */ - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - -// Multiprecision right shift by one. -void mp_shiftr1(digit_t *x, const unsigned int nwords) { - unsigned int i; - - for (i = 0; i < nwords - 1; i++) { - SHIFTR(x[i + 1], x[i], 1, x[i], RADIX); - } - x[nwords - 1] >>= 1; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c deleted file mode 100644 index d3fdbe722c..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sidh.c +++ /dev/null @@ -1,286 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) -*********************************************************************************************/ - -#include "../s2n_pq_random.h" -#include "utils/s2n_safety.h" - -static void init_basis(const digit_t *gen, f2elm_t *XP, f2elm_t *XQ, f2elm_t *XR) { // Initialization of basis points - - fpcopy(gen, XP->e[0]); - fpcopy(gen + NWORDS_FIELD, XP->e[1]); - fpcopy(gen + 2 * NWORDS_FIELD, XQ->e[0]); - fpcopy(gen + 3 * NWORDS_FIELD, XQ->e[1]); - fpcopy(gen + 4 * NWORDS_FIELD, XR->e[0]); - fpcopy(gen + 5 * NWORDS_FIELD, XR->e[1]); -} - -int random_mod_order_A(unsigned char *random_digits) { // Generation of Alice's secret key - // Outputs random value in [0, 2^eA - 1] - GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, SECRETKEY_A_BYTES)); - random_digits[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; // Masking last byte - return S2N_SUCCESS; -} - -int random_mod_order_B(unsigned char *random_digits) { // Generation of Bob's secret key - // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] - GUARD_AS_POSIX(s2n_get_random_bytes(random_digits, SECRETKEY_B_BYTES)); - random_digits[SECRETKEY_B_BYTES - 1] &= MASK_BOB; // Masking last byte - return S2N_SUCCESS; -} - -int EphemeralKeyGeneration_A(const digit_t *PrivateKeyA, unsigned char *PublicKeyA) { // Alice's ephemeral public key generation - // Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. - // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. 
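The key-generation and shared-secret routines below feed the private key, stored as a little-endian array of machine words, into the LADDER3PT loop removed earlier, which extracts one secret bit per iteration with (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1. A standalone sketch of that lookup, assuming 64-bit digits (RADIX = 64, LOG2RADIX = 6), purely for illustration:

#include <stdint.h>
#include <assert.h>

#define RADIX     64
#define LOG2RADIX 6

typedef uint64_t digit_t;

/* Return bit i of a multiword little-endian integer m. */
static int scalar_bit(const digit_t *m, unsigned int i) {
    return (int)((m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1);
}

int main(void) {
    /* value = 2^0 + 2^65 + 2^70: bit 0 lives in word 0, bits 65 and 70 in word 1 */
    digit_t m[2] = { 1ull, (1ull << 1) | (1ull << 6) };
    assert(scalar_bit(m, 0) == 1);
    assert(scalar_bit(m, 1) == 0);
    assert(scalar_bit(m, 65) == 1);
    assert(scalar_bit(m, 70) == 1);
    assert(scalar_bit(m, 71) == 0);
    return 0;
}

random_mod_order_A and random_mod_order_B above keep such a scalar in range simply by masking the top byte (MASK_ALICE / MASK_BOB) after drawing random bytes.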
- point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; - f2elm_t _XPA, _XQA, _XRA, coeff[3], _A24plus = {0}, _C24 = {0}, _A = {0}; - f2elm_t *XPA=&_XPA, *XQA=&_XQA, *XRA=&_XRA, *A24plus=&_A24plus, *C24=&_C24, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; - - // Initialize basis points - init_basis((const digit_t *) A_gen, XPA, XQA, XRA); - init_basis((const digit_t *) B_gen, &phiP->X, &phiQ->X, &phiR->X); - fpcopy((const digit_t *) &Montgomery_one, (phiP->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiQ->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiR->Z.e)[0]); - - // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 - fpcopy((const digit_t *) &Montgomery_one, A24plus->e[0]); - fp2add(A24plus, A24plus, A24plus); - fp2add(A24plus, A24plus, C24); - fp2add(A24plus, C24, A); - fp2add(C24, C24, A24plus); - - // Retrieve kernel point - LADDER3PT(XPA, XQA, XRA, PrivateKeyA, ALICE, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Alice; row++) { - while (index < MAX_Alice - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Alice[ii++]; - xDBLe(R, R, A24plus, C24, (int) (2 * m)); - index += m; - } - get_4_isog(R, A24plus, C24, coeff); - - for (i = 0; i < npts; i++) { - eval_4_isog(pts[i], coeff); - } - eval_4_isog(phiP, coeff); - eval_4_isog(phiQ, coeff); - eval_4_isog(phiR, coeff); - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_4_isog(R, A24plus, C24, coeff); - eval_4_isog(phiP, coeff); - eval_4_isog(phiQ, coeff); - eval_4_isog(phiR, coeff); - - inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z); - fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X); - fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X); - fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X); - - // Format public key - fp2_encode(&phiP->X, PublicKeyA); - fp2_encode(&phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); - fp2_encode(&phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES); - - return 0; -} - -int EphemeralKeyGeneration_B(const digit_t *PrivateKeyB, unsigned char *PublicKeyB) { // Bob's ephemeral public key generation - // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. - // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. 
- point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; - f2elm_t _XPB, _XQB, _XRB, coeff[3], _A24plus = {0}, _A24minus = {0}, _A = {0}; - f2elm_t *XPB=&_XPB, *XQB=&_XQB, *XRB=&_XRB, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; - - // Initialize basis points - init_basis((const digit_t *) B_gen, XPB, XQB, XRB); - init_basis((const digit_t *) A_gen, &phiP->X, &phiQ->X, &phiR->X); - fpcopy((const digit_t *) &Montgomery_one, (phiP->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiQ->Z.e)[0]); - fpcopy((const digit_t *) &Montgomery_one, (phiR->Z.e)[0]); - - // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 - fpcopy((const digit_t *) &Montgomery_one, A24plus->e[0]); - fp2add(A24plus, A24plus, A24plus); - fp2add(A24plus, A24plus, A24minus); - fp2add(A24plus, A24minus, A); - fp2add(A24minus, A24minus, A24plus); - - // Retrieve kernel point - LADDER3PT(XPB, XQB, XRB, PrivateKeyB, BOB, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Bob; row++) { - while (index < MAX_Bob - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Bob[ii++]; - xTPLe(R, R, A24minus, A24plus, (int) m); - index += m; - } - get_3_isog(R, A24minus, A24plus, coeff); - - for (i = 0; i < npts; i++) { - eval_3_isog(pts[i], coeff); - } - eval_3_isog(phiP, coeff); - eval_3_isog(phiQ, coeff); - eval_3_isog(phiR, coeff); - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_3_isog(R, A24minus, A24plus, coeff); - eval_3_isog(phiP, coeff); - eval_3_isog(phiQ, coeff); - eval_3_isog(phiR, coeff); - - inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z); - fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X); - fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X); - fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X); - - // Format public key - fp2_encode(&phiP->X, PublicKeyB); - fp2_encode(&phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); - fp2_encode(&phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES); - - return 0; -} - -int EphemeralSecretAgreement_A(const digit_t *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA) { // Alice's ephemeral shared secret computation - // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB - // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1]. - // Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. - // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes. 
- point_proj_t R, pts[MAX_INT_POINTS_ALICE]; - f2elm_t coeff[3], PKB[3], _jinv; - f2elm_t _A24plus = {0}, _C24 = {0}, _A = {0}; - f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *C24=&_C24, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; - - // Initialize images of Bob's basis - fp2_decode(PublicKeyB, &PKB[0]); - fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, &PKB[1]); - fp2_decode(PublicKeyB + 2 * FP2_ENCODED_BYTES, &PKB[2]); - - // Initialize constants: A24plus = A+2C, C24 = 4C, where C=1 - get_A(&PKB[0], &PKB[1], &PKB[2], A); - fpadd((const digit_t *) &Montgomery_one, (const digit_t *) &Montgomery_one, C24->e[0]); - fp2add(A, C24, A24plus); - fpadd(C24->e[0], C24->e[0], C24->e[0]); - - // Retrieve kernel point - LADDER3PT(&PKB[0], &PKB[1], &PKB[2], PrivateKeyA, ALICE, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Alice; row++) { - while (index < MAX_Alice - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Alice[ii++]; - xDBLe(R, R, A24plus, C24, (int) (2 * m)); - index += m; - } - get_4_isog(R, A24plus, C24, coeff); - - for (i = 0; i < npts; i++) { - eval_4_isog(pts[i], coeff); - } - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_4_isog(R, A24plus, C24, coeff); - fp2add(A24plus, A24plus, A24plus); - fp2sub(A24plus, C24, A24plus); - fp2add(A24plus, A24plus, A24plus); - j_inv(A24plus, C24, jinv); - fp2_encode(jinv, SharedSecretA); // Format shared secret - - return 0; -} - -int EphemeralSecretAgreement_B(const digit_t *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB) { // Bob's ephemeral shared secret computation - // It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA - // Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1]. - // Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. - // Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes. 
- point_proj_t R, pts[MAX_INT_POINTS_BOB]; - f2elm_t coeff[3], PKB[3], _jinv; - f2elm_t _A24plus = {0}, _A24minus = {0}, _A = {0}; - f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A; - unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; - - // Initialize images of Alice's basis - fp2_decode(PublicKeyA, &PKB[0]); - fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, &PKB[1]); - fp2_decode(PublicKeyA + 2 * FP2_ENCODED_BYTES, &PKB[2]); - - // Initialize constants: A24plus = A+2C, A24minus = A-2C, where C=1 - get_A(&PKB[0], &PKB[1], &PKB[2], A); - fpadd((const digit_t *) &Montgomery_one, (const digit_t *) &Montgomery_one, A24minus->e[0]); - fp2add(A, A24minus, A24plus); - fp2sub(A, A24minus, A24minus); - - // Retrieve kernel point - LADDER3PT(&PKB[0], &PKB[1], &PKB[2], PrivateKeyB, BOB, R, A); - - // Traverse tree - index = 0; - for (row = 1; row < MAX_Bob; row++) { - while (index < MAX_Bob - row) { - fp2copy(&R->X, &pts[npts]->X); - fp2copy(&R->Z, &pts[npts]->Z); - pts_index[npts++] = index; - m = strat_Bob[ii++]; - xTPLe(R, R, A24minus, A24plus, (int) m); - index += m; - } - get_3_isog(R, A24minus, A24plus, coeff); - - for (i = 0; i < npts; i++) { - eval_3_isog(pts[i], coeff); - } - - fp2copy(&pts[npts - 1]->X, &R->X); - fp2copy(&pts[npts - 1]->Z, &R->Z); - index = pts_index[npts - 1]; - npts -= 1; - } - - get_3_isog(R, A24minus, A24plus, coeff); - fp2add(A24plus, A24minus, A); - fp2add(A, A, A); - fp2sub(A24plus, A24minus, A24plus); - j_inv(A, A24plus, jinv); - fp2_encode(jinv, SharedSecretB); // Format shared secret - - return 0; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c deleted file mode 100644 index 7768ad3650..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sike_r2_kem.c +++ /dev/null @@ -1,120 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: supersingular isogeny key encapsulation (SIKE) protocol -*********************************************************************************************/ - -#include <string.h> -#include "../s2n_pq_random.h" -#include "fips202.h" -#include "utils/s2n_safety.h" -#include "tls/s2n_kem.h" -#include "pq-crypto/s2n_pq.h" - -int SIKE_P434_r2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { - // SIKE's key generation - // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) - // public key pk (CRYPTO_PUBLICKEYBYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); - - digit_t _sk[(SECRETKEY_B_BYTES / sizeof(digit_t)) + 1]; - - // Generate lower portion of secret key sk <- s||SK - GUARD_AS_POSIX(s2n_get_random_bytes(sk, MSG_BYTES)); - GUARD(random_mod_order_B((unsigned char *)_sk)); - - // Generate public key pk - EphemeralKeyGeneration_B(_sk, pk); - - memcpy(sk + MSG_BYTES, _sk, SECRETKEY_B_BYTES); - - // Append public key pk to secret key sk - memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES); - - return 0; -} - -int SIKE_P434_r2_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) { - // SIKE's encapsulation - // Input: public key pk (CRYPTO_PUBLICKEYBYTES bytes) - // Outputs: shared secret ss (CRYPTO_BYTES bytes) - // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), 
S2N_ERR_PQ_DISABLED); - - union { - unsigned char b[SECRETKEY_A_BYTES]; - digit_t d[SECRETKEY_A_BYTES/sizeof(digit_t)]; - } ephemeralsk; - unsigned char jinvariant[FP2_ENCODED_BYTES]; - unsigned char h[MSG_BYTES]; - unsigned char temp[CRYPTO_CIPHERTEXTBYTES + MSG_BYTES]; - - // Generate ephemeralsk <- G(m||pk) mod oA - GUARD_AS_POSIX(s2n_get_random_bytes(temp, MSG_BYTES)); - memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES); - shake256(ephemeralsk.b, SECRETKEY_A_BYTES, temp, CRYPTO_PUBLICKEYBYTES + MSG_BYTES); - - /* ephemeralsk is a union; the memory set here through .b will get accessed through the .d member later */ - /* cppcheck-suppress unreadVariable */ - /* cppcheck-suppress unmatchedSuppression */ - ephemeralsk.b[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; - - // Encrypt - EphemeralKeyGeneration_A(ephemeralsk.d, ct); - EphemeralSecretAgreement_A(ephemeralsk.d, pk, jinvariant); - shake256(h, MSG_BYTES, jinvariant, FP2_ENCODED_BYTES); - for (int i = 0; i < MSG_BYTES; i++) { - ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i]; - } - // Generate shared secret ss <- H(m||ct) - memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); - shake256(ss, CRYPTO_BYTES, temp, CRYPTO_CIPHERTEXTBYTES + MSG_BYTES); - - return 0; -} - -int SIKE_P434_r2_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) { - // SIKE's decapsulation - // Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) - // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) - // Outputs: shared secret ss (CRYPTO_BYTES bytes) - ENSURE_POSIX(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); - - union { - unsigned char b[SECRETKEY_A_BYTES]; - digit_t d[SECRETKEY_A_BYTES/sizeof(digit_t)]; - } ephemeralsk_; - unsigned char jinvariant_[FP2_ENCODED_BYTES]; - unsigned char h_[MSG_BYTES]; - unsigned char c0_[CRYPTO_PUBLICKEYBYTES]; - unsigned char temp[CRYPTO_CIPHERTEXTBYTES + MSG_BYTES]; - - digit_t _sk[(SECRETKEY_B_BYTES / sizeof(digit_t)) + 1]; - memcpy(_sk, sk + MSG_BYTES, SECRETKEY_B_BYTES); - - // Decrypt - EphemeralSecretAgreement_B(_sk, ct, jinvariant_); - shake256(h_, MSG_BYTES, jinvariant_, FP2_ENCODED_BYTES); - for (int i = 0; i < MSG_BYTES; i++) { - temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i]; - } - // Generate ephemeralsk_ <- G(m||pk) mod oA - memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES); - shake256(ephemeralsk_.b, SECRETKEY_A_BYTES, temp, CRYPTO_PUBLICKEYBYTES + MSG_BYTES); - - /* ephemeralsk_ is a union; the memory set here through .b will get accessed through the .d member later */ - /* cppcheck-suppress unreadVariable */ - /* cppcheck-suppress uninitvar */ - /* cppcheck-suppress unmatchedSuppression */ - ephemeralsk_.b[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; - - // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) - EphemeralKeyGeneration_A(ephemeralsk_.d, c0_); - if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) { - memcpy(temp, sk, MSG_BYTES); - } - memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); - shake256(ss, CRYPTO_BYTES, temp, CRYPTO_CIPHERTEXTBYTES + MSG_BYTES); - - return 0; -} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S b/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S deleted file mode 100644 index 831fc1b7fb..0000000000 --- a/contrib/restricted/aws/s2n/pq-crypto/sike_r2/sikep434r2_fp_x64_asm.S +++ /dev/null @@ -1,962 +0,0 @@ 
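The sike_r2_kem.c removed just above follows the re-encryption check of a Fujisaki-Okamoto style KEM: decapsulation decrypts the ciphertext, re-derives the ephemeral secret from the recovered message, re-encrypts, and compares the re-derived ephemeral public key with the first CRYPTO_PUBLICKEYBYTES of the received ciphertext; on a mismatch it hashes the stored random value s instead of the message, so an invalid ciphertext still yields a well-defined shared secret (implicit rejection). The toy below mirrors only that control flow; the one-byte "hash" and the trivial "trapdoor" are deliberate stand-ins, not SHAKE-256 or the SIDH routines.

#include <stdint.h>
#include <string.h>
#include <assert.h>

#define MSG_BYTES 8
#define CT_BYTES  (1 + MSG_BYTES)

/* Toy one-byte hash of a || b, standing in for SHAKE-256. */
static uint8_t toy_hash2(const uint8_t *a, size_t alen, const uint8_t *b, size_t blen) {
    uint8_t buf[32], h = 0x5a;
    memcpy(buf, a, alen);
    memcpy(buf + alen, b, blen);
    for (size_t i = 0; i < alen + blen; i++) h = (uint8_t)(h * 31u + buf[i]);
    return h;
}

/* Toy deterministic "encryption": c0 carries the ephemeral value r, c1 masks m. */
static void toy_encrypt(uint8_t pk, uint8_t r, const uint8_t *m, uint8_t *ct) {
    ct[0] = r;
    for (int i = 0; i < MSG_BYTES; i++)
        ct[1 + i] = m[i] ^ (uint8_t)(pk + r + (uint8_t)i);
}

static void toy_decrypt(uint8_t sk, const uint8_t *ct, uint8_t *m) {
    for (int i = 0; i < MSG_BYTES; i++)
        m[i] = ct[1 + i] ^ (uint8_t)(sk + ct[0] + (uint8_t)i);
}

/* Encapsulation: the ephemeral value is derived from the message and pk. */
static uint8_t toy_encaps(uint8_t pk, const uint8_t *m, uint8_t *ct) {
    uint8_t r = toy_hash2(m, MSG_BYTES, &pk, 1);          /* r  = G(m || pk) */
    toy_encrypt(pk, r, m, ct);
    return toy_hash2(m, MSG_BYTES, ct, CT_BYTES);         /* ss = H(m || ct) */
}

/* Decapsulation with the re-encryption check and implicit rejection. */
static uint8_t toy_decaps(uint8_t sk, uint8_t pk, const uint8_t *s, const uint8_t *ct) {
    uint8_t m[MSG_BYTES], ct2[CT_BYTES];
    toy_decrypt(sk, ct, m);                               /* m'  = Dec(sk, ct)   */
    uint8_t r = toy_hash2(m, MSG_BYTES, &pk, 1);          /* r'  = G(m' || pk)   */
    toy_encrypt(pk, r, m, ct2);                           /* re-encrypt          */
    if (memcmp(ct2, ct, CT_BYTES) != 0)                   /* ciphertext invalid: */
        memcpy(m, s, MSG_BYTES);                          /* hash s instead      */
    return toy_hash2(m, MSG_BYTES, ct, CT_BYTES);         /* ss = H(m' or s || ct) */
}

int main(void) {
    uint8_t pk = 0x42, sk = 0x42;                         /* trivial toy key pair   */
    uint8_t s[MSG_BYTES] = {9, 8, 7, 6, 5, 4, 3, 2};      /* stored rejection value */
    uint8_t m[MSG_BYTES] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint8_t ct[CT_BYTES];

    uint8_t ss1 = toy_encaps(pk, m, ct);
    assert(toy_decaps(sk, pk, s, ct) == ss1);             /* honest case agrees */

    ct[3] ^= 0x80;                                        /* tamper with c1 */
    (void)toy_decaps(sk, pk, s, ct);                      /* invalid ciphertext still yields a defined secret */
    return 0;
}

Note that the removed code compares only the re-derived c0 (the ephemeral public key) against the start of the ciphertext; the toy compares the whole ciphertext for brevity.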
-//******************************************************************************************* -// SIDH: an efficient supersingular isogeny cryptography library -// -// Abstract: field arithmetic in x64 assembly for P434 on Linux -//******************************************************************************************* - -.intel_syntax noprefix - -/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */ - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx - -// Define addition instructions -#ifdef S2N_ADX - -#define ADD1 adox -#define ADC1 adox -#define ADD2 adcx -#define ADC2 adcx - -#else // S2N_ADX - -#define ADD1 add -#define ADC1 adc -#define ADD2 add -#define ADC2 adc - -#endif // S2N_ADX - -// The constants below (asm_p434, asm_p434p1, and asm_p434x2) are duplicated from -// P434.c, and correspond to the arrays p434, p434p1, and p434x2. The values are -// idenctical; they are just represented here as standard (base 10) ints, instead -// of hex. If, for any reason, the constants are changed in one file, they should be -// updated in the other file as well. - -.text -.align 32 -.type asm_p434, @object -.size asm_p434, 56 -asm_p434: - .quad -1 - .quad -1 - .quad -1 - .quad -161717841442111489 - .quad 8918917783347572387 - .quad 7853257225132122198 - .quad 620258357900100 -.align 32 -.type asm_p434p1, @object -.size asm_p434p1, 56 -asm_p434p1: - .quad 0 - .quad 0 - .quad 0 - .quad -161717841442111488 - .quad 8918917783347572387 - .quad 7853257225132122198 - .quad 620258357900100 -.align 32 -.type asm_p434x2, @object -.size asm_p434x2, 56 -asm_p434x2: - .quad -2 - .quad -1 - .quad -1 - .quad -323435682884222977 - .quad -608908507014406841 - .quad -2740229623445307220 - .quad 1240516715800200 - -//*********************************************************************** -// Field addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global fpadd434_asm -fpadd434_asm: - push r12 - push r13 - push r14 - push r15 - push rbx - push rbp - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - adc r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - - mov rbx, [rip+asm_p434x2] - sub r8, rbx - mov rcx, [rip+asm_p434x2+8] - sbb r9, rcx - sbb r10, rcx - mov rdi, [rip+asm_p434x2+24] - sbb r11, rdi - mov rsi, [rip+asm_p434x2+32] - sbb r12, rsi - mov rbp, [rip+asm_p434x2+40] - sbb r13, rbp - mov r15, [rip+asm_p434x2+48] - sbb r14, r15 - sbb rax, 0 - - and rbx, rax - and rcx, rax - and rdi, rax - and rsi, rax - and rbp, rax - and r15, rax - - add r8, rbx - adc r9, rcx - adc r10, rcx - adc r11, rdi - adc r12, rsi - adc r13, rbp - adc r14, r15 - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -//*********************************************************************** -// Field subtraction -// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global fpsub434_asm -fpsub434_asm: - push r12 - push r13 - push r14 - - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov 
r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - mov r13, [reg_p1+40] - mov r14, [reg_p1+48] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb r14, [reg_p2+48] - sbb rax, 0 - - mov rcx, [rip+asm_p434x2] - mov rdi, [rip+asm_p434x2+8] - mov rsi, [rip+asm_p434x2+24] - and rcx, rax - and rdi, rax - and rsi, rax - add r8, rcx - adc r9, rdi - adc r10, rdi - adc r11, rsi - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - setc cl - - mov r8, [rip+asm_p434x2+32] - mov rdi, [rip+asm_p434x2+40] - mov rsi, [rip+asm_p434x2+48] - and r8, rax - and rdi, rax - and rsi, rax - bt rcx, 0 - adc r12, r8 - adc r13, rdi - adc r14, rsi - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], r14 - - pop r14 - pop r13 - pop r12 - ret - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication, a full row at a time -// Inputs: memory pointers M0 and M1 -// Outputs: memory pointer C -// Temps: regs T0:T9 -///////////////////////////////////////////////////////////////// - -#ifdef S2N_ADX -.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - - mov rdx, 8\M0 - mulx \T3, \T4, \M1 // T3:T4 = A1*B0 - adox \T2, rax - xor rax, rax - mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T3, \T6 - mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2 - adox \T3, \T1 - adcx \T5, \T0 - adcx \T6, rax - adox \T5, \T2 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T6, rax - xor rax, rax - mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1 - adox \T0, \T3 - mov 16\C, \T0 // C2_final - adcx \T1, \T5 - mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2 - adcx \T4, \T6 - adcx \T0, rax - adox \T1, \T2 - adox \T3, \T4 - adox \T0, rax - mov 24\C, \T1 // C3_final - mov 32\C, \T3 // C4_final - mov 40\C, \T0 // C5_final -.endm - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - xor rax, rax - adox \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adox \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adox \T2, \T4 - - mov rdx, 8\M0 - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - adox \T3, rax - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - adox \T4, \T0 - mov 8\C, \T4 // C1_final - adcx \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adcx \T6, \T8 - adox \T5, \T1 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adcx \T7, \T9 - adcx \T8, rax - adox \T6, \T2 - - mov rdx, 16\M0 - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - adox \T7, \T3 - adox \T8, rax - xor rax, rax - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - adox \T0, \T5 - mov 16\C, \T0 // C2_final - adcx \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adcx \T2, \T4 - adox \T1, \T6 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adcx \T3, \T9 - mov rdx, 24\M0 - adcx \T4, rax - - adox \T2, \T7 - adox \T3, \T8 - adox \T4, rax - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - xor rax, rax - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - adcx \T5, \T7 - adox \T1, \T0 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adcx \T6, \T8 - adox \T2, \T5 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adcx \T7, \T9 - adcx \T8, rax - - adox \T3, \T6 - adox \T4, \T7 - adox \T8, rax - mov 24\C, 
\T1 // C3_final - mov 32\C, \T2 // C4_final - mov 40\C, \T3 // C5_final - mov 48\C, \T4 // C6_final - mov 56\C, \T8 // C7_final -.endm - -#else // S2N_ADX - -.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - - mov rdx, 8\M0 - mulx \T3, \T4, \M1 // T3:T4 = A1*B0 - adc \T2, 0 - mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T3, \T1 - adc \T5, \T2 - mulx \T0, \T1, 16\M1 // T0:T1 = A1*B2 - adc \T0, 0 - - add \T3, \T6 - adc \T5, \T1 - adc \T0, 0 - - mov rdx, 16\M0 - mulx \T1, \T2, \M1 // T1:T2 = A2*B0 - add \T2, \T3 - mov 16\C, \T2 // C2_final - mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1 - adc \T1, \T5 - adc \T0, \T4 - mulx \T2, \T3, 16\M1 // T0:T3 = A2*B2 - adc \T2, 0 - add \T1, \T6 - adc \T0, \T3 - adc \T2, 0 - mov 24\C, \T1 // C3_final - mov 32\C, \T0 // C4_final - mov 40\C, \T2 // C5_final -.endm - -.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 - mov rdx, \M0 - mulx \T0, \T1, \M1 // T0:T1 = A0*B0 - mov \C, \T1 // C0_final - mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 - add \T0, \T2 - mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 - adc \T1, \T3 - mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 - adc \T2, \T4 - mov rdx, 8\M0 - adc \T3, 0 - - mulx \T5, \T4, \M1 // T5:T4 = A1*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T4, \T0 - mov 8\C, \T4 // C1_final - adc \T5, \T1 - adc \T6, \T2 - adc \T7, \T3 - mov rdx, 16\M0 - adc \T8, 0 - - mulx \T1, \T0, \M1 // T1:T0 = A2*B0 - mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 - add \T1, \T3 - mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 - adc \T2, \T4 - mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 - adc \T3, \T9 - mov rdx, 24\M0 - adc \T4, 0 - - add \T0, \T5 - mov 16\C, \T0 // C2_final - adc \T1, \T6 - adc \T2, \T7 - adc \T3, \T8 - adc \T4, 0 - - mulx \T5, \T0, \M1 // T5:T0 = A3*B0 - mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 - add \T5, \T7 - mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 - adc \T6, \T8 - mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 - adc \T7, \T9 - adc \T8, 0 - - add \T1, \T0 - mov 24\C, \T1 // C3_final - adc \T2, \T5 - mov 32\C, \T2 // C4_final - adc \T3, \T6 - mov 40\C, \T3 // C5_final - adc \T4, \T7 - mov 48\C, \T4 // C6_final - adc \T8, 0 - mov 56\C, \T8 // C7_final -.endm -#endif // S2N_ADX - -//***************************************************************************** -// 434-bit multiplication using Karatsuba (one level), schoolbook (one level) -//***************************************************************************** -.global mul434_asm -mul434_asm: - push r12 - push r13 - push r14 - push r15 - mov rcx, reg_p3 - - // r8-r11 <- AH + AL, rax <- mask - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - push rbx - push rbp - sub rsp, 96 - add r8, [reg_p1+32] - adc r9, [reg_p1+40] - adc r10, [reg_p1+48] - adc r11, 0 - sbb rax, 0 - mov [rsp], r8 - mov [rsp+8], r9 - mov [rsp+16], r10 - mov [rsp+24], r11 - - // r12-r15 <- BH + BL, rbx <- mask - xor rbx, rbx - mov r12, [reg_p2] - mov r13, [reg_p2+8] - mov r14, [reg_p2+16] - mov r15, [reg_p2+24] - add r12, [reg_p2+32] - adc r13, [reg_p2+40] - adc r14, [reg_p2+48] - adc r15, 0 - sbb rbx, 0 - mov [rsp+32], r12 - mov [rsp+40], r13 - mov [rsp+48], r14 - mov [rsp+56], r15 - - // r12-r15 <- masked (BH + BL) - and 
r12, rax - and r13, rax - and r14, rax - and r15, rax - - // r8-r11 <- masked (AH + AL) - and r8, rbx - and r9, rbx - and r10, rbx - and r11, rbx - - // r8-r11 <- masked (AH + AL) + masked (BH + BL) - add r8, r12 - adc r9, r13 - adc r10, r14 - adc r11, r15 - mov [rsp+64], r8 - mov [rsp+72], r9 - mov [rsp+80], r10 - mov [rsp+88], r11 - - // [rsp] <- (AH+AL) x (BH+BL), low part - MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp - - // [rcx] <- AL x BL - MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 - - // [rcx+64] <- AH x BH - MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14 - - // r8-r11 <- (AH+AL) x (BH+BL), final step - mov r8, [rsp+64] - mov r9, [rsp+72] - mov r10, [rsp+80] - mov r11, [rsp+88] - mov rax, [rsp+32] - add r8, rax - mov rax, [rsp+40] - adc r9, rax - mov rax, [rsp+48] - adc r10, rax - mov rax, [rsp+56] - adc r11, rax - - // [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - mov r12, [rsp] - mov r13, [rsp+8] - mov r14, [rsp+16] - mov r15, [rsp+24] - sub r12, [rcx] - sbb r13, [rcx+8] - sbb r14, [rcx+16] - sbb r15, [rcx+24] - sbb r8, [rcx+32] - sbb r9, [rcx+40] - sbb r10, [rcx+48] - sbb r11, [rcx+56] - - // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - sub r12, [rcx+64] - sbb r13, [rcx+72] - sbb r14, [rcx+80] - sbb r15, [rcx+88] - sbb r8, [rcx+96] - sbb r9, [rcx+104] - sbb r10, 0 - sbb r11, 0 - - add r12, [rcx+32] - mov [rcx+32], r12 // Result C4-C7 - adc r13, [rcx+40] - mov [rcx+40], r13 - adc r14, [rcx+48] - mov [rcx+48], r14 - adc r15, [rcx+56] - mov [rcx+56], r15 - adc r8, [rcx+64] - mov [rcx+64], r8 // Result C8-C15 - adc r9, [rcx+72] - mov [rcx+72], r9 - adc r10, [rcx+80] - mov [rcx+80], r10 - adc r11, [rcx+88] - mov [rcx+88], r11 - mov r12, [rcx+96] - adc r12, 0 - mov [rcx+96], r12 - mov r13, [rcx+104] - adc r13, 0 - mov [rcx+104], r13 - - add rsp, 96 - pop rbp - pop rbx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -///////////////////////////////////////////////////////////////// MACRO -// Schoolbook integer multiplication -// Inputs: memory pointers M0 and M1 -// Outputs: regs T0:T5 -// Temps: regs T7:T6 -///////////////////////////////////////////////////////////////// -.macro MUL64x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 // T1 <- C1_final - ADC1 \T2, \T5 // T2 <- C2_final - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 // T3 <- C3_final - ADC1 \T4, rax // T4 <- C4_final -.endm - -#ifdef S2N_ADX -.macro MUL128x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - xor rax, rax - mulx \T3, \T5, 16\M1 - ADD1 \T1, \T4 - ADC1 \T2, \T5 - mulx \T4, \T5, 24\M1 - ADC1 \T3, \T5 - ADC1 \T4, rax - - xor rax, rax - mov rdx, 8\M0 - mulx \T6, \T5, \M1 - ADD2 \T1, \T5 // T1 <- C1_final - ADC2 \T2, \T6 - mulx \T5, \T6, 8\M1 - ADC2 \T3, \T5 - ADD1 \T2, \T6 - mulx \T5, \T6, 16\M1 - ADC2 \T4, \T5 - ADC1 \T3, \T6 - mulx \T5, \T6, 24\M1 - ADC2 \T5, rax - ADC1 \T4, \T6 - ADC1 \T5, rax -.endm - -#else // S2N_ADX - -.macro MUL128x256_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6 - mov rdx, \M0 - mulx \T1, \T0, \M1 // T0 <- C0_final - mulx \T2, \T4, 8\M1 - mulx \T3, \T5, 16\M1 - add \T1, \T4 - adc \T2, \T5 - mulx \T4, \T5, 24\M1 - adc \T3, \T5 - adc \T4, 0 - - mov rdx, 8\M0 - mulx \T6, \T5, \M1 - add \T1, \T5 // T1 <- C1_final - adc \T2, \T6 - mulx \T5, \T6, 8\M1 - adc \T3, \T5 - mulx 
\T5, rax, 16\M1 - adc \T4, \T5 - mulx \T5, rdx, 24\M1 - adc \T5, 0 - add \T2, \T6 - adc \T3, rax - adc \T4, rdx - adc \T5, 0 -.endm -#endif // S2N_ADX - -//************************************************************************************** -// Montgomery reduction -// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 -// Operation: c [reg_p2] = a [reg_p1] -// NOTE: a=c is not allowed -//************************************************************************************** -.global rdc434_asm -rdc434_asm: - push r12 - push r13 - - // a[0-1] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL [reg_p1], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx - - xor rcx, rcx - add r8, [reg_p1+24] - adc r9, [reg_p1+32] - adc r10, [reg_p1+40] - adc r11, [reg_p1+48] - adc r12, [reg_p1+56] - adc r13, [reg_p1+64] - adc rcx, [reg_p1+72] - mov [reg_p1+24], r8 - mov [reg_p1+32], r9 - mov [reg_p1+40], r10 - mov [reg_p1+48], r11 - mov [reg_p1+56], r12 - mov [reg_p1+64], r13 - mov [reg_p1+72], rcx - mov r8, [reg_p1+80] - mov r9, [reg_p1+88] - mov r10, [reg_p1+96] - mov r11, [reg_p1+104] - adc r8, 0 - adc r9, 0 - adc r10, 0 - adc r11, 0 - mov [reg_p1+80], r8 - mov [reg_p1+88], r9 - mov [reg_p1+96], r10 - mov [reg_p1+104], r11 - - // a[2-3] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL [reg_p1+16], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx - - xor rcx, rcx - add r8, [reg_p1+40] - adc r9, [reg_p1+48] - adc r10, [reg_p1+56] - adc r11, [reg_p1+64] - adc r12, [reg_p1+72] - adc r13, [reg_p1+80] - adc rcx, [reg_p1+88] - mov [reg_p1+40], r8 - mov [reg_p1+48], r9 - mov [reg_p1+56], r10 - mov [reg_p1+64], r11 - mov [reg_p1+72], r12 - mov [reg_p1+80], r13 - mov [reg_p1+88], rcx - mov r8, [reg_p1+96] - mov r9, [reg_p1+104] - adc r8, 0 - adc r9, 0 - mov [reg_p1+96], r8 - mov [reg_p1+104], r9 - - // a[4-5] x p434p1_nz --> result: r8:r13 - MUL128x256_SCHOOL [reg_p1+32], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13, rcx - - xor rcx, rcx - add r8, [reg_p1+56] - adc r9, [reg_p1+64] - adc r10, [reg_p1+72] - adc r11, [reg_p1+80] - adc r12, [reg_p1+88] - adc r13, [reg_p1+96] - adc rcx, [reg_p1+104] - mov [reg_p2], r8 // Final result c0-c1 - mov [reg_p2+8], r9 - mov [reg_p1+72], r10 - mov [reg_p1+80], r11 - mov [reg_p1+88], r12 - mov [reg_p1+96], r13 - mov [reg_p1+104], rcx - - // a[6-7] x p434p1_nz --> result: r8:r12 - MUL64x256_SCHOOL [reg_p1+48], [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13 - - // Final result c2:c6 - add r8, [reg_p1+72] - adc r9, [reg_p1+80] - adc r10, [reg_p1+88] - adc r11, [reg_p1+96] - adc r12, [reg_p1+104] - mov [reg_p2+16], r8 - mov [reg_p2+24], r9 - mov [reg_p2+32], r10 - mov [reg_p2+40], r11 - mov [reg_p2+48], r12 - - pop r13 - pop r12 - ret - -//*********************************************************************** -// 434-bit multiprecision addition -// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -//*********************************************************************** -.global mp_add434_asm -mp_add434_asm: - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - add r8, [reg_p2] - adc r9, [reg_p2+8] - adc r10, [reg_p2+16] - adc r11, [reg_p2+24] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - - mov r8, [reg_p1+32] - mov r9, [reg_p1+40] - mov r10, [reg_p1+48] - adc r8, [reg_p2+32] - adc r9, [reg_p2+40] - adc r10, [reg_p2+48] - mov [reg_p3+32], r8 - mov [reg_p3+40], r9 - mov [reg_p3+48], r10 - ret - 
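Note on the mp_add434_asm routine in the assembly diff above: it is a plain seven-word add-with-carry chain (one add followed by six adc), and the final carry is simply dropped, which is safe as long as both operands stay below 2*p434 (their sum then fits comfortably in 448 bits). For readers following the diff without x86-64 background, a minimal C sketch of the same chain is given below. It is an illustration only, not code from this commit; the identifiers mp_add7 and NWORDS are invented for the sketch, but the carry handling mirrors the S2N_SIKE_P434_R3_ADDC macro introduced later in this diff in sikep434r3.h.

#include <stdint.h>

#define NWORDS 7          /* 7 x 64-bit digits hold a 434-bit field element (448 bits of storage) */
typedef uint64_t digit_t;

/* c = a + b over NWORDS digits, returning the carry out of the top digit.
 * Each iteration corresponds to one add/adc step of mp_add434_asm. */
unsigned int mp_add7(const digit_t *a, const digit_t *b, digit_t *c)
{
    unsigned int carry = 0;
    for (int i = 0; i < NWORDS; i++) {
        digit_t t = a[i] + (digit_t)carry;          /* fold in the incoming carry      */
        c[i] = b[i] + t;                            /* add the second operand          */
        carry = (t < (digit_t)carry) | (c[i] < t);  /* carry out if either add wrapped */
    }
    return carry;                                   /* mp_add434_asm discards this bit */
}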
-//*********************************************************************** -// 2x434-bit multiprecision subtraction/addition -// Operation: c [x2] = a [x0] - b [x1]. If c < 0, add p434*2^448 -//*********************************************************************** -.global mp_subadd434x2_asm -mp_subadd434x2_asm: - push r12 - push r13 - push r14 - push r15 - xor rax, rax - mov r8, [reg_p1] - mov r9, [reg_p1+8] - mov r10, [reg_p1+16] - mov r11, [reg_p1+24] - mov r12, [reg_p1+32] - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - - mov r8, [reg_p1+40] - mov r9, [reg_p1+48] - mov r10, [reg_p1+56] - mov r11, [reg_p1+64] - mov r12, [reg_p1+72] - sbb r8, [reg_p2+40] - sbb r9, [reg_p2+48] - sbb r10, [reg_p2+56] - sbb r11, [reg_p2+64] - sbb r12, [reg_p2+72] - mov [reg_p3+40], r8 - mov [reg_p3+48], r9 - mov [reg_p3+56], r10 - - mov r13, [reg_p1+80] - mov r14, [reg_p1+88] - mov r15, [reg_p1+96] - mov rcx, [reg_p1+104] - sbb r13, [reg_p2+80] - sbb r14, [reg_p2+88] - sbb r15, [reg_p2+96] - sbb rcx, [reg_p2+104] - sbb rax, 0 - - // Add p434 anded with the mask in rax - mov r8, [rip+asm_p434] - mov r9, [rip+asm_p434+24] - mov r10, [rip+asm_p434+32] - mov rdi, [rip+asm_p434+40] - mov rsi, [rip+asm_p434+48] - and r8, rax - and r9, rax - and r10, rax - and rdi, rax - and rsi, rax - mov rax, [reg_p3+56] - add rax, r8 - adc r11, r8 - adc r12, r8 - adc r13, r9 - adc r14, r10 - adc r15, rdi - adc rcx, rsi - - mov [reg_p3+56], rax - mov [reg_p3+64], r11 - mov [reg_p3+72], r12 - mov [reg_p3+80], r13 - mov [reg_p3+88], r14 - mov [reg_p3+96], r15 - mov [reg_p3+104], rcx - pop r15 - pop r14 - pop r13 - pop r12 - ret - -//*********************************************************************** -// Double 2x434-bit multiprecision subtraction -// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -//*********************************************************************** -.global mp_dblsub434x2_asm -mp_dblsub434x2_asm: - push r12 - push r13 - - xor rax, rax - mov r8, [reg_p3] - mov r9, [reg_p3+8] - mov r10, [reg_p3+16] - mov r11, [reg_p3+24] - mov r12, [reg_p3+32] - mov r13, [reg_p3+40] - mov rcx, [reg_p3+48] - sub r8, [reg_p1] - sbb r9, [reg_p1+8] - sbb r10, [reg_p1+16] - sbb r11, [reg_p1+24] - sbb r12, [reg_p1+32] - sbb r13, [reg_p1+40] - sbb rcx, [reg_p1+48] - adc rax, 0 - sub r8, [reg_p2] - sbb r9, [reg_p2+8] - sbb r10, [reg_p2+16] - sbb r11, [reg_p2+24] - sbb r12, [reg_p2+32] - sbb r13, [reg_p2+40] - sbb rcx, [reg_p2+48] - adc rax, 0 - mov [reg_p3], r8 - mov [reg_p3+8], r9 - mov [reg_p3+16], r10 - mov [reg_p3+24], r11 - mov [reg_p3+32], r12 - mov [reg_p3+40], r13 - mov [reg_p3+48], rcx - - mov r8, [reg_p3+56] - mov r9, [reg_p3+64] - mov r10, [reg_p3+72] - mov r11, [reg_p3+80] - mov r12, [reg_p3+88] - mov r13, [reg_p3+96] - mov rcx, [reg_p3+104] - sub r8, rax - sbb r8, [reg_p1+56] - sbb r9, [reg_p1+64] - sbb r10, [reg_p1+72] - sbb r11, [reg_p1+80] - sbb r12, [reg_p1+88] - sbb r13, [reg_p1+96] - sbb rcx, [reg_p1+104] - sub r8, [reg_p2+56] - sbb r9, [reg_p2+64] - sbb r10, [reg_p2+72] - sbb r11, [reg_p2+80] - sbb r12, [reg_p2+88] - sbb r13, [reg_p2+96] - sbb rcx, [reg_p2+104] - mov [reg_p3+56], r8 - mov [reg_p3+64], r9 - mov [reg_p3+72], r10 - mov [reg_p3+80], r11 - mov [reg_p3+88], r12 - mov [reg_p3+96], r13 - mov [reg_p3+104], rcx - - pop r13 - pop r12 - ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c 
b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c new file mode 100644 index 0000000000..7ce71ae3d3 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.c @@ -0,0 +1,146 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny parameters and generation of functions for P434 +*********************************************************************************************/ + +#include "sikep434r3.h" + +/* Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: + * + * Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at + * the leftmost position (i.e., little endian format). Elements (a+b*i) over GF(p^2), where a and b are + * defined over GF(p), are encoded as {a, b}, with a in the least significant position. Elliptic curve + * points P = (x,y) are encoded as {x, y}, with x in the least significant position. Internally, the + * number of digits used to represent all these elements is obtained by approximating the number of bits + * to the immediately greater multiple of 32. For example, a 434-bit field element is represented with + * Ceil(434 / 64) = 7 64-bit digits or Ceil(434 / 32) = 14 32-bit digits. + * + * Curve isogeny system "SIDHp434". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over + * GF(p434^2), where A=6, B=1, C=1 and p434 = 2^216*3^137-1 */ + +const uint64_t p434[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xFDC1767AE2FFFFFF, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, + 0x0002341F27177344 +}; + +const uint64_t p434x2[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, + 0x0004683E4E2EE688 +}; + +const uint64_t p434x4[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xF705D9EB8BFFFFFF, 0xEF1971E0C562BA8F, 0xB3F17F5A07148159, + 0x0008D07C9C5DCD11 +}; + +const uint64_t p434p1[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0xFDC1767AE3000000, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056, + 0x0002341F27177344 +}; + +/* Alice's generator values {XPA0 + XPA1*i, XQA0 + xQA1*i, XRA0 + XRA1*i} in GF(p434^2), + * expressed in Montgomery representation */ +const uint64_t A_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x05ADF455C5C345BF, 0x91935C5CC767AC2B, 0xAFE4E879951F0257, + 0x70E792DC89FA27B1, 0xF797F526BB48C8CD, 0x2181DB6131AF621F, + 0x00000A1C08B1ECC4, /* XPA0 */ + + 0x74840EB87CDA7788, 0x2971AA0ECF9F9D0B, 0xCB5732BDF41715D5, + 0x8CD8E51F7AACFFAA, 0xA7F424730D7E419F, 0xD671EB919A179E8C, + 0x0000FFA26C5A924A, /* XPA1 */ + + 0xFEC6E64588B7273B, 0xD2A626D74CBBF1C6, 0xF8F58F07A78098C7, + 0xE23941F470841B03, 0x1B63EDA2045538DD, 0x735CFEB0FFD49215, + 0x0001C4CB77542876, /* XQA0 */ + + 0xADB0F733C17FFDD6, 0x6AFFBD037DA0A050, 0x680EC43DB144E02F, + 0x1E2E5D5FF524E374, 0xE2DDA115260E2995, 0xA6E4B552E2EDE508, + 0x00018ECCDDF4B53E, /* XQA1 */ + + 0x01BA4DB518CD6C7D, 0x2CB0251FE3CC0611, 0x259B0C6949A9121B, + 0x60E17AC16D2F82AD, 0x3AA41F1CE175D92D, 0x413FBE6A9B9BC4F3, + 0x00022A81D8D55643, /* XRA0 */ + + 0xB8ADBC70FC82E54A, 0xEF9CDDB0D5FADDED, 0x5820C734C80096A0, + 0x7799994BAA96E0E4, 0x044961599E379AF8, 0xDB2B94FBF09F27E2, + 0x0000B87FC716C0C6 /* XRA1 */ +}; + +/* Bob's 
generator values {XPB0, XQB0, XRB0 + XRB1*i} in GF(p434^2), expressed in Montgomery representation */ +const uint64_t B_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x6E5497556EDD48A3, 0x2A61B501546F1C05, 0xEB919446D049887D, + 0x5864A4A69D450C4F, 0xB883F276A6490D2B, 0x22CC287022D5F5B9, + 0x0001BED4772E551F, /* XPB0 */ + + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, /* XPB1 */ + + 0xFAE2A3F93D8B6B8E, 0x494871F51700FE1C, 0xEF1A94228413C27C, + 0x498FF4A4AF60BD62, 0xB00AD2A708267E8A, 0xF4328294E017837F, + 0x000034080181D8AE, /* XQB0 */ + + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, /* XQB1 */ + + 0x283B34FAFEFDC8E4, 0x9208F44977C3E647, 0x7DEAE962816F4E9A, + 0x68A2BA8AA262EC9D, 0x8176F112EA43F45B, 0x02106D022634F504, + 0x00007E8A50F02E37, /* XRB0 */ + + 0xB378B7C1DA22CCB1, 0x6D089C99AD1D9230, 0xEBE15711813E2369, + 0x2B35A68239D48A53, 0x445F6FD138407C93, 0xBEF93B29A3F6B54B, + 0x000173FA910377D3 /* XRB1 */ +}; + +/* Montgomery constant Montgomery_R2 = (2^448)^2 mod p434 */ +const uint64_t Montgomery_R2[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x28E55B65DCD69B30, 0xACEC7367768798C2, 0xAB27973F8311688D, + 0x175CC6AF8D6C7C0B, 0xABCD92BF2DDE347E, 0x69E16A61C7686D9A, + 0x000025A89BCDD12A +}; + +/* Value one in Montgomery representation */ +const uint64_t Montgomery_one[S2N_SIKE_P434_R3_NWORDS64_FIELD] = { + 0x000000000000742C, 0x0000000000000000, 0x0000000000000000, + 0xB90FF404FC000000, 0xD801A4FB559FACD4, 0xE93254545F77410C, + 0x0000ECEEA7BD2EDA +}; + +/* Fixed parameters for isogeny tree computation */ +const unsigned int strat_Alice[S2N_SIKE_P434_R3_MAX_ALICE-1] = { + 48, 28, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 7, 4, + 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, + 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 +}; + +const unsigned int strat_Bob[S2N_SIKE_P434_R3_MAX_BOB-1] = { + 66, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, + 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 32, + 16, 8, 4, 3, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, + 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 +}; + +/* Returns true if the machine is big endian */ +bool is_big_endian() +{ + uint16_t i = 1; + uint8_t *ptr = (uint8_t *)&i; + return !(*ptr); +} + +uint32_t bswap32(uint32_t x) +{ + uint32_t i = (x >> 16) | (x << 16); + return ((i & UINT32_C(0xff00ff00)) >> 8) | ((i & UINT32_C(0x00ff00ff)) << 8); +} + +uint64_t bswap64(uint64_t x) +{ + return bswap32(x >> 32) | (((uint64_t)bswap32(x)) << 32); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h new file mode 100644 index 0000000000..5b797b1d7f --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3.h @@ -0,0 +1,181 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny parameters, generation of functions for P434; +* configuration and 
platform-dependent macros +*********************************************************************************************/ + +#pragma once + +#include <stdint.h> +#include <stdbool.h> +#include <stddef.h> + +/* All sikep434r3 functions and global variables in the pq-crypto/sike_r3 directory + * should be defined using this namespace macro to avoid symbol collisions. For example, + * in foo.h, declare a function as follows: + * + * #define foo_function S2N_SIKE_P434_R3_NAMESPACE(foo_function) + * int foo_function(int foo_argument); */ +#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s + +/* Endian-related functionality */ +/* Returns true if the machine is big endian */ +#define is_big_endian S2N_SIKE_P434_R3_NAMESPACE(is_big_endian) +bool is_big_endian(void); + +#define bswap32 S2N_SIKE_P434_R3_NAMESPACE(bswap32) +uint32_t bswap32(uint32_t x); + +#define bswap64 S2N_SIKE_P434_R3_NAMESPACE(bswap64) +uint64_t bswap64(uint64_t x); + +/* Arch specific definitions */ +#define digit_t S2N_SIKE_P434_R3_NAMESPACE(digit_t) +#define hdigit_t S2N_SIKE_P434_R3_NAMESPACE(hdigit_t) +#if defined(_AMD64_) || defined(__x86_64) || defined(__x86_64__) || defined(__aarch64__) || defined(_S390X_) || defined(_ARM64_) || defined(__powerpc64__) || (defined(__riscv) && (__riscv_xlen == 64)) + #define S2N_SIKE_P434_R3_NWORDS_FIELD 7 /* Number of words of a 434-bit field element */ + #define S2N_SIKE_P434_R3_ZERO_WORDS 3 /* Number of "0" digits in the least significant part of p434 + 1 */ + #define S2N_SIKE_P434_R3_RADIX 64 + #define S2N_SIKE_P434_R3_LOG2RADIX 6 + #define S2N_SIKE_P434_R3_BSWAP_DIGIT(i) bswap64((i)) + typedef uint64_t digit_t; + typedef uint32_t hdigit_t; +#elif defined(_X86_) || defined(_ARM_) || defined(__arm__) || defined(__i386__) + #define S2N_SIKE_P434_R3_NWORDS_FIELD 14 /* Number of words of a 434-bit field element */ + #define S2N_SIKE_P434_R3_ZERO_WORDS 6 /* Number of "0" digits in the least significant part of p434 + 1 */ + #define S2N_SIKE_P434_R3_RADIX 32 + #define S2N_SIKE_P434_R3_LOG2RADIX 5 + #define S2N_SIKE_P434_R3_BSWAP_DIGIT(i) bswap32((i)) + typedef uint32_t digit_t; + typedef uint16_t hdigit_t; +#else + #error -- "Unsupported ARCHITECTURE" +#endif + +/* Basic constants */ +#define S2N_SIKE_P434_R3_NBITS_FIELD 434 +#define S2N_SIKE_P434_R3_MAXBITS_FIELD 448 +/* Number of 64-bit words of a 434-bit field element */ +#define S2N_SIKE_P434_R3_NWORDS64_FIELD ((S2N_SIKE_P434_R3_NBITS_FIELD+63)/64) +#define S2N_SIKE_P434_R3_NBITS_ORDER 256 +/* Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. 
*/ +#define S2N_SIKE_P434_R3_NWORDS_ORDER ((S2N_SIKE_P434_R3_NBITS_ORDER+S2N_SIKE_P434_R3_RADIX-1)/S2N_SIKE_P434_R3_RADIX) +#define S2N_SIKE_P434_R3_ALICE 0 +#define S2N_SIKE_P434_R3_BOB 1 +#define S2N_SIKE_P434_R3_OALICE_BITS 216 +#define S2N_SIKE_P434_R3_OBOB_BITS 218 +#define S2N_SIKE_P434_R3_MASK_ALICE 0xFF +#define S2N_SIKE_P434_R3_MASK_BOB 0x01 + +/* Fixed parameters for isogeny tree computation */ +#define S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE 7 +#define S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB 8 +#define S2N_SIKE_P434_R3_MAX_ALICE 108 +#define S2N_SIKE_P434_R3_MAX_BOB 137 +#define S2N_SIKE_P434_R3_MSG_BYTES 16 +#define S2N_SIKE_P434_R3_SECRETKEY_A_BYTES ((S2N_SIKE_P434_R3_OALICE_BITS + 7) / 8) +#define S2N_SIKE_P434_R3_SECRETKEY_B_BYTES ((S2N_SIKE_P434_R3_OBOB_BITS - 1 + 7) / 8) +#define S2N_SIKE_P434_R3_FP2_ENCODED_BYTES (2 * ((S2N_SIKE_P434_R3_NBITS_FIELD + 7) / 8)) + +/* SIDH's basic element definitions and point representations */ +/* Datatype for representing 434-bit field elements (448-bit max.) */ +#define felm_t S2N_SIKE_P434_R3_NAMESPACE(felm_t) +typedef digit_t felm_t[S2N_SIKE_P434_R3_NWORDS_FIELD]; + +/* Datatype for representing double-precision 2x434-bit field elements (2x448-bit max.) */ +#define dfelm_t S2N_SIKE_P434_R3_NAMESPACE(dfelm_t) +typedef digit_t dfelm_t[2*S2N_SIKE_P434_R3_NWORDS_FIELD]; + +/* Datatype for representing quadratic extension field elements GF(p434^2) */ +#define f2elm_t S2N_SIKE_P434_R3_NAMESPACE(f2elm_t) +#define felm_s S2N_SIKE_P434_R3_NAMESPACE(felm_s) +typedef struct felm_s { + felm_t e[2]; +} f2elm_t; + +/* Point representation in projective XZ Montgomery coordinates. */ +#define point_proj S2N_SIKE_P434_R3_NAMESPACE(point_proj) +typedef struct { f2elm_t X; f2elm_t Z; } point_proj; +#define point_proj_t S2N_SIKE_P434_R3_NAMESPACE(point_proj_t) +typedef point_proj point_proj_t[1]; + +/* Macro to avoid compiler warnings when detecting unreferenced parameters */ +#define S2N_SIKE_P434_R3_UNREFERENCED_PARAMETER(PAR) ((void)(PAR)) + +/********************** Constant-time unsigned comparisons ***********************/ +/* The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise */ + +/* Is x != 0? */ +static __inline unsigned int is_digit_nonzero_ct(const digit_t x) +{ + return (unsigned int)((x | (0-x)) >> (S2N_SIKE_P434_R3_RADIX-1)); +} + +/* Is x = 0? */ +static __inline unsigned int is_digit_zero_ct(const digit_t x) +{ + return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); +} + +/* Is x < y? 
*/ +static __inline unsigned int is_digit_lessthan_ct(const digit_t x, const digit_t y) +{ + return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (S2N_SIKE_P434_R3_RADIX-1)); +} + +/* Definitions for generic C implementation */ + +typedef uint64_t uint128_t[2]; + +/* Digit multiplication */ +#define S2N_SIKE_P434_R3_MUL(multiplier, multiplicand, hi, lo) \ + digit_x_digit((multiplier), (multiplicand), &(lo)); + +/* Digit addition with carry */ +#define S2N_SIKE_P434_R3_ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ + { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ + (sumOut) = (addend2) + tempReg; \ + (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } + +/* Digit subtraction with borrow */ +#define S2N_SIKE_P434_R3_SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ + { digit_t tempReg = (minuend) - (subtrahend); \ + unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ + (differenceOut) = tempReg - (digit_t)(borrowIn); \ + (borrowOut) = borrowReg; } + +/* Shift right with flexible datatype */ +#define S2N_SIKE_P434_R3_SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << ((DigitSize) - (shift))); + +/* Fixed parameters for computation */ +#define p434 S2N_SIKE_P434_R3_NAMESPACE(p434) +extern const uint64_t p434[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define p434x2 S2N_SIKE_P434_R3_NAMESPACE(p434x2) +extern const uint64_t p434x2[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define p434x4 S2N_SIKE_P434_R3_NAMESPACE(p434x4) +extern const uint64_t p434x4[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define p434p1 S2N_SIKE_P434_R3_NAMESPACE(p434p1) +extern const uint64_t p434p1[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define A_gen S2N_SIKE_P434_R3_NAMESPACE(A_gen) +extern const uint64_t A_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define B_gen S2N_SIKE_P434_R3_NAMESPACE(B_gen) +extern const uint64_t B_gen[6*S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define Montgomery_R2 S2N_SIKE_P434_R3_NAMESPACE(Montgomery_R2) +extern const uint64_t Montgomery_R2[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define Montgomery_one S2N_SIKE_P434_R3_NAMESPACE(Montgomery_one) +extern const uint64_t Montgomery_one[S2N_SIKE_P434_R3_NWORDS64_FIELD]; + +#define strat_Alice S2N_SIKE_P434_R3_NAMESPACE(strat_Alice) +extern const unsigned int strat_Alice[S2N_SIKE_P434_R3_MAX_ALICE-1]; + +#define strat_Bob S2N_SIKE_P434_R3_NAMESPACE(strat_Bob) +extern const unsigned int strat_Bob[S2N_SIKE_P434_R3_MAX_BOB-1]; diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h new file mode 100644 index 0000000000..cf3c4feb85 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_api.h @@ -0,0 +1,78 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: API header file for P434 +*********************************************************************************************/ + +#pragma once + +#include "sikep434r3.h" + +/*********************** Key encapsulation mechanism API ***********************/ +/* Encoding of keys for KEM-based isogeny system "SIKEp434" (wire format): + * + * Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the least + * significant octet is located in the lowest memory address). 
Elements (a+b*i) over GF(p434^2), + * where a and b are defined over GF(p434), are encoded as {a, b}, with a in the lowest memory portion. + * + * Private keys sk consist of the concatenation of a 16-byte random value, a value in the range + * [0, 2^217-1] and the public key pk. In the SIKE API, private keys are encoded in 374 octets in + * little endian format. Public keys pk consist of 3 elements in GF(p434^2). In the SIKE API, pk + * is encoded in 330 octets. Ciphertexts ct consist of the concatenation of a public key value + * and a 16-byte value. In the SIKE API, ct is encoded in 330 + 16 = 346 octets. Shared keys ss + * consist of a value of 16 octets. */ + +/*********************** Ephemeral key exchange API ***********************/ + +/* SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use + * it with static keys. See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, + * C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. Extended version available at: + * http://eprint.iacr.org/2016/859 */ + +/* Generation of Bob's secret key + * Outputs random value in [0, 2^Floor(Log(2,3^137)) - 1] to be used as Bob's private key */ +#define random_mod_order_B S2N_SIKE_P434_R3_NAMESPACE(random_mod_order_B) +int random_mod_order_B(unsigned char* random_digits); + +/* Alice's ephemeral public key generation + * Input: a private key PrivateKeyA in the range [0, 2^216 - 1], stored in 27 bytes. + * Output: the public key PublicKeyA consisting of 3 GF(p434^2) elements encoded in 330 bytes. */ +#define EphemeralKeyGeneration_A S2N_SIKE_P434_R3_NAMESPACE(EphemeralKeyGeneration_A) +int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); + +/* Bob's ephemeral key-pair generation + * It produces a private key PrivateKeyB and computes the public key PublicKeyB. + * The private key is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. + * The public key consists of 3 GF(p434^2) elements encoded in 330 bytes. */ +#define EphemeralKeyGeneration_B S2N_SIKE_P434_R3_NAMESPACE(EphemeralKeyGeneration_B) +int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); + +/* Alice's ephemeral shared secret computation + * It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB + * Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^216 - 1], stored in 27 bytes. + * Bob's PublicKeyB consists of 3 GF(p434^2) elements encoded in 330 bytes. + * Output: a shared secret SharedSecretA that consists of one element in GF(p434^2) encoded in 110 bytes. */ +#define EphemeralSecretAgreement_A S2N_SIKE_P434_R3_NAMESPACE(EphemeralSecretAgreement_A) +int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); + +/* Bob's ephemeral shared secret computation + * It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA + * Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^137)) - 1], stored in 28 bytes. + * Alice's PublicKeyA consists of 3 GF(p434^2) elements encoded in 330 bytes. + * Output: a shared secret SharedSecretB that consists of one element in GF(p434^2) encoded in 110 bytes. 
*/ +#define EphemeralSecretAgreement_B S2N_SIKE_P434_R3_NAMESPACE(EphemeralSecretAgreement_B) +int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); + +/* Encoding of keys for KEX-based isogeny system "SIDHp434" (wire format): + * + * Elements over GF(p434) are encoded in 55 octets in little endian format (i.e., the + * least significant octet is located in the lowest memory address). Elements (a+b*i) + * over GF(p434^2), where a and b are defined over GF(p434), are encoded as {a, b}, with + * a in the lowest memory portion. + * + * Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^216-1] and + * [0, 2^Floor(Log(2,3^137)) - 1], resp. In the SIDH API, Alice's and Bob's private keys + * are encoded in 27 and 28 octets, resp., in little endian format. Public keys PublicKeyA + * and PublicKeyB consist of 3 elements in GF(p434^2). In the SIDH API, they are encoded in + * 330 octets. Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p434^2). + * In the SIDH API, they are encoded in 110 octets. */ diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c new file mode 100644 index 0000000000..e5ae4e7c7e --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.c @@ -0,0 +1,348 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: elliptic curve and isogeny functions +*********************************************************************************************/ + +#include "sikep434r3.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_ec_isogeny.h" + +/* Doubling of a Montgomery point in projective coordinates (X:Z). + * Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. + * Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). */ +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24) +{ + f2elm_t _t0, _t1; + f2elm_t *t0=&_t0, *t1=&_t1; + + mp2_sub_p2(&P->X, &P->Z, t0); /* t0 = X1-Z1 */ + mp2_add(&P->X, &P->Z, t1); /* t1 = X1+Z1 */ + fp2sqr_mont(t0, t0); /* t0 = (X1-Z1)^2 */ + fp2sqr_mont(t1, t1); /* t1 = (X1+Z1)^2 */ + fp2mul_mont(C24, t0, &Q->Z); /* Z2 = C24*(X1-Z1)^2 */ + fp2mul_mont(t1, &Q->Z, &Q->X); /* X2 = C24*(X1-Z1)^2*(X1+Z1)^2 */ + mp2_sub_p2(t1, t0, t1); /* t1 = (X1+Z1)^2-(X1-Z1)^2 */ + fp2mul_mont(A24plus, t1, t0); /* t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] */ + mp2_add(&Q->Z, t0, &Q->Z); /* Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 */ + fp2mul_mont(&Q->Z, t1, &Q->Z); /* Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] */ +} + +/* Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. + * Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. + * Output: projective Montgomery x-coordinates Q <- (2^e)*P. */ +void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e) +{ + int i; + + copy_words((const digit_t*)P, (digit_t*)Q, 2*2*S2N_SIKE_P434_R3_NWORDS_FIELD); + + for (i = 0; i < e; i++) { + xDBL(Q, Q, A24plus, C24); + } +} + +/* Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. 
+ * Input: projective point of order four P = (X4:Z4). + * Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients + * that are used to evaluate the isogeny at a point in eval_4_isog(). */ +void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff) +{ + mp2_sub_p2(&P->X, &P->Z, &coeff[1]); /* coeff[1] = X4-Z4 */ + mp2_add(&P->X, &P->Z, &coeff[2]); /* coeff[2] = X4+Z4 */ + fp2sqr_mont(&P->Z, &coeff[0]); /* coeff[0] = Z4^2 */ + mp2_add(&coeff[0], &coeff[0], &coeff[0]); /* coeff[0] = 2*Z4^2 */ + fp2sqr_mont(&coeff[0], C24); /* C24 = 4*Z4^4 */ + mp2_add(&coeff[0], &coeff[0], &coeff[0]); /* coeff[0] = 4*Z4^2 */ + fp2sqr_mont(&P->X, A24plus); /* A24plus = X4^2 */ + mp2_add(A24plus, A24plus, A24plus); /* A24plus = 2*X4^2 */ + fp2sqr_mont(A24plus, A24plus); /* A24plus = 4*X4^4 */ +} + +/* Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined + * by the 3 coefficients in coeff (computed in the function get_4_isog()). + * Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). + * Output: the projective point P = phi(P) = (X:Z) in the codomain. */ +void eval_4_isog(point_proj_t P, f2elm_t *coeff) +{ + f2elm_t _t0, _t1; + f2elm_t *t0=&_t0, *t1=&_t1; + + mp2_add(&P->X, &P->Z, t0); /* t0 = X+Z */ + mp2_sub_p2(&P->X, &P->Z, t1); /* t1 = X-Z */ + fp2mul_mont(t0, &coeff[1], &P->X); /* X = (X+Z)*coeff[1] */ + fp2mul_mont(t1, &coeff[2], &P->Z); /* Z = (X-Z)*coeff[2] */ + fp2mul_mont(t0, t1, t0); /* t0 = (X+Z)*(X-Z) */ + fp2mul_mont(&coeff[0], t0, t0); /* t0 = coeff[0]*(X+Z)*(X-Z) */ + mp2_add(&P->X, &P->Z, t1); /* t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] */ + mp2_sub_p2(&P->X, &P->Z, &P->Z); /* Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] */ + fp2sqr_mont(t1, t1); /* t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 */ + fp2sqr_mont(&P->Z, &P->Z); /* Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 */ + mp2_add(t1, t0, &P->X); /* X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 */ + mp2_sub_p2(&P->Z, t0, t0); /* t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) */ + fp2mul_mont(&P->X, t1, &P->X); /* Xfinal */ + fp2mul_mont(&P->Z, t0, &P->Z); /* Zfinal */ +} + +/* Tripling of a Montgomery point in projective coordinates (X:Z). + * Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + * Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). 
*/ +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus) +{ + f2elm_t _t0, _t1, _t2, _t3, _t4, _t5, _t6; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4, *t5=&_t5, *t6=&_t6; + + mp2_sub_p2(&P->X, &P->Z, t0); /* t0 = X-Z */ + fp2sqr_mont(t0, t2); /* t2 = (X-Z)^2 */ + mp2_add(&P->X, &P->Z, t1); /* t1 = X+Z */ + fp2sqr_mont(t1, t3); /* t3 = (X+Z)^2 */ + mp2_add(&P->X, &P->X, t4); /* t4 = 2*X */ + mp2_add(&P->Z, &P->Z, t0); /* t0 = 2*Z */ + fp2sqr_mont(t4, t1); /* t1 = 4*X^2 */ + mp2_sub_p2(t1, t3, t1); /* t1 = 4*X^2 - (X+Z)^2 */ + mp2_sub_p2(t1, t2, t1); /* t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 */ + fp2mul_mont(A24plus, t3, t5); /* t5 = A24plus*(X+Z)^2 */ + fp2mul_mont(t3, t5, t3); /* t3 = A24plus*(X+Z)^4 */ + fp2mul_mont(A24minus, t2, t6); /* t6 = A24minus*(X-Z)^2 */ + fp2mul_mont(t2, t6, t2); /* t2 = A24minus*(X-Z)^4 */ + mp2_sub_p2(t2, t3, t3); /* t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 */ + mp2_sub_p2(t5, t6, t2); /* t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 */ + fp2mul_mont(t1, t2, t1); /* t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] */ + fp2add(t3, t1, t2); /* t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^4 - A24plus*(X+Z)^4 */ + fp2sqr_mont(t2, t2); /* t2 = t2^2 */ + fp2mul_mont(t4, t2, &Q->X); /* X3 = 2*X*t2 */ + fp2sub(t3, t1, t1); /* t1 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] */ + fp2sqr_mont(t1, t1); /* t1 = t1^2 */ + fp2mul_mont(t0, t1, &Q->Z); /* Z3 = 2*Z*t1 */ +} + +/* Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. + * Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + * Output: projective Montgomery x-coordinates Q <- (3^e)*P. */ +void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e) +{ + int i; + + copy_words((const digit_t*)P, (digit_t*)Q, 2*2*S2N_SIKE_P434_R3_NWORDS_FIELD); + + for (i = 0; i < e; i++) { + xTPL(Q, Q, A24minus, A24plus); + } +} + +/* Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. + * Input: projective point of order three P = (X3:Z3). + * Output: the 3-isogenous Montgomery curve with projective coefficient A/C. 
*/ +void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff) +{ + f2elm_t _t0, _t1, _t2, _t3, _t4; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3, *t4=&_t4; + + mp2_sub_p2(&P->X, &P->Z, &coeff[0]); /* coeff0 = X-Z */ + fp2sqr_mont(&coeff[0], t0); /* t0 = (X-Z)^2 */ + mp2_add(&P->X, &P->Z, &coeff[1]); /* coeff1 = X+Z */ + fp2sqr_mont(&coeff[1], t1); /* t1 = (X+Z)^2 */ + mp2_add(&P->X, &P->X, t3); /* t3 = 2*X */ + fp2sqr_mont(t3, t3); /* t3 = 4*X^2 */ + fp2sub(t3, t0, t2); /* t2 = 4*X^2 - (X-Z)^2 */ + fp2sub(t3, t1, t3); /* t3 = 4*X^2 - (X+Z)^2 */ + mp2_add(t0, t3, t4); /* t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 */ + mp2_add(t4, t4, t4); /* t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) */ + mp2_add(t1, t4, t4); /* t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 */ + fp2mul_mont(t2, t4, A24minus); /* A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] */ + mp2_add(t1, t2, t4); /* t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 */ + mp2_add(t4, t4, t4); /* t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) */ + mp2_add(t0, t4, t4); /* t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 */ + fp2mul_mont(t3, t4, A24plus); /* A24plus = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] */ +} + +/* Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and + * a point P with 2 coefficients in coeff (computed in the function get_3_isog()). + * Inputs: projective points P = (X3:Z3) and Q = (X:Z). + * Output: the projective point Q <- phi(Q) = (X3:Z3). */ +void eval_3_isog(point_proj_t Q, const f2elm_t *coeff) +{ + f2elm_t _t0, _t1, _t2; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; + + mp2_add(&Q->X, &Q->Z, t0); /* t0 = X+Z */ + mp2_sub_p2(&Q->X, &Q->Z, t1); /* t1 = X-Z */ + fp2mul_mont(&coeff[0], t0, t0); /* t0 = coeff0*(X+Z) */ + fp2mul_mont(&coeff[1], t1, t1); /* t1 = coeff1*(X-Z) */ + mp2_add(t0, t1, t2); /* t2 = coeff0*(X+Z) + coeff1*(X-Z) */ + mp2_sub_p2(t1, t0, t0); /* t0 = coeff1*(X-Z) - coeff0*(X+Z) */ + fp2sqr_mont(t2, t2); /* t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 */ + fp2sqr_mont(t0, t0); /* t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 */ + fp2mul_mont(&Q->X, t2, &Q->X); /* X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 */ + fp2mul_mont(&Q->Z, t0, &Q->Z); /* Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 */ +} + +/* 3-way simultaneous inversion + * Input: z1,z2,z3 + * Output: 1/z1,1/z2,1/z3 (override inputs). */ +void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3) +{ + f2elm_t _t0, _t1, _t2, _t3; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2, *t3=&_t3; + + fp2mul_mont(z1, z2, t0); /* t0 = z1*z2 */ + fp2mul_mont(z3, t0, t1); /* t1 = z1*z2*z3 */ + fp2inv_mont(t1); /* t1 = 1/(z1*z2*z3) */ + fp2mul_mont(z3, t1, t2); /* t2 = 1/(z1*z2) */ + fp2mul_mont(t2, z2, t3); /* t3 = 1/z1 */ + fp2mul_mont(t2, z1, z2); /* z2 = 1/z2 */ + fp2mul_mont(t0, t1, z3); /* z3 = 1/z3 */ + fp2copy(t3, z1); /* z1 = 1/z1 */ +} + +/* Given the x-coordinates of P, Q, and R, returns the value A corresponding to the + * Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. + * Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. + * Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
*/ +void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A) +{ + f2elm_t _t0, _t1, one = {0}; + f2elm_t *t0=&_t0, *t1=&_t1; + + + fpcopy((const digit_t*)&Montgomery_one,one.e[0]); + fp2add(xP, xQ, t1); /* t1 = xP+xQ */ + fp2mul_mont(xP, xQ, t0); /* t0 = xP*xQ */ + fp2mul_mont(xR, t1, A); /* A = xR*t1 */ + fp2add(t0, A, A); /* A = A+t0 */ + fp2mul_mont(t0, xR, t0); /* t0 = t0*xR */ + fp2sub(A, &one, A); /* A = A-1 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2add(t1, xR, t1); /* t1 = t1+xR */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2sqr_mont(A, A); /* A = A^2 */ + fp2inv_mont(t0); /* t0 = 1/t0 */ + fp2mul_mont(A, t0, A); /* A = A*t0 */ + fp2sub(A, t1, A); /* Afinal = A-t1 */ +} + +/* Computes the j-invariant of a Montgomery curve with projective constant. + * Input: A,C in GF(p^2). + * Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve + * B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. */ +void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv) +{ + f2elm_t _t0, _t1; + f2elm_t *t0=&_t0, *t1=&_t1; + + fp2sqr_mont(A, jinv); /* jinv = A^2 */ + fp2sqr_mont(C, t1); /* t1 = C^2 */ + fp2add(t1, t1, t0); /* t0 = t1+t1 */ + fp2sub(jinv, t0, t0); /* t0 = jinv-t0 */ + fp2sub(t0, t1, t0); /* t0 = t0-t1 */ + fp2sub(t0, t1, jinv); /* jinv = t0-t1 */ + fp2sqr_mont(t1, t1); /* t1 = t1^2 */ + fp2mul_mont(jinv, t1, jinv); /* jinv = jinv*t1 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2sqr_mont(t0, t1); /* t1 = t0^2 */ + fp2mul_mont(t0, t1, t0); /* t0 = t0*t1 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2add(t0, t0, t0); /* t0 = t0+t0 */ + fp2inv_mont(jinv); /* jinv = 1/jinv */ + fp2mul_mont(jinv, t0, jinv); /* jinv = t0*jinv */ +} + +/* Simultaneous doubling and differential addition. + * Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, + * affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. + * Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, + * and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. */ +static void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t *xPQ, const f2elm_t *A24) +{ + f2elm_t _t0, _t1, _t2; + f2elm_t *t0=&_t0, *t1=&_t1, *t2=&_t2; + + mp2_add(&P->X, &P->Z, t0); /* t0 = XP+ZP */ + mp2_sub_p2(&P->X, &P->Z, t1); /* t1 = XP-ZP */ + fp2sqr_mont(t0, &P->X); /* XP = (XP+ZP)^2 */ + mp2_sub_p2(&Q->X, &Q->Z, t2); /* t2 = XQ-ZQ */ + mp2_add(&Q->X, &Q->Z, &Q->X); /* XQ = XQ+ZQ */ + fp2mul_mont(t0, t2, t0); /* t0 = (XP+ZP)*(XQ-ZQ) */ + fp2sqr_mont(t1, &P->Z); /* ZP = (XP-ZP)^2 */ + fp2mul_mont(t1, &Q->X, t1); /* t1 = (XP-ZP)*(XQ+ZQ) */ + mp2_sub_p2(&P->X, &P->Z, t2); /* t2 = (XP+ZP)^2-(XP-ZP)^2 */ + fp2mul_mont(&P->X, &P->Z, &P->X); /* XP = (XP+ZP)^2*(XP-ZP)^2 */ + fp2mul_mont(A24, t2, &Q->X); /* XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] */ + mp2_sub_p2(t0, t1, &Q->Z); /* ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) */ + mp2_add(&Q->X, &P->Z, &P->Z); /* ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 */ + mp2_add(t0, t1, &Q->X); /* XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) */ + fp2mul_mont(&P->Z, t2, &P->Z); /* ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] */ + fp2sqr_mont(&Q->Z, &Q->Z); /* ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 */ + fp2sqr_mont(&Q->X, &Q->X); /* XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 */ + fp2mul_mont(&Q->Z, xPQ, &Q->Z); /* ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 */ +} + +/* Swap points. 
+ * If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P */ +static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) +{ + unsigned int i; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + digit_t temp = option & (P->X.e[0][i] ^ Q->X.e[0][i]); + P->X.e[0][i] = temp ^ P->X.e[0][i]; + Q->X.e[0][i] = temp ^ Q->X.e[0][i]; + temp = option & (P->X.e[1][i] ^ Q->X.e[1][i]); + P->X.e[1][i] = temp ^ P->X.e[1][i]; + Q->X.e[1][i] = temp ^ Q->X.e[1][i]; + temp = option & (P->Z.e[0][i] ^ Q->Z.e[0][i]); + P->Z.e[0][i] = temp ^ P->Z.e[0][i]; + Q->Z.e[0][i] = temp ^ Q->Z.e[0][i]; + temp = option & (P->Z.e[1][i] ^ Q->Z.e[1][i]); + P->Z.e[1][i] = temp ^ P->Z.e[1][i]; + Q->Z.e[1][i] = temp ^ Q->Z.e[1][i]; + } +} + +void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t* m, + const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A) +{ + point_proj_t R0 = {0}, R2 = {0}; + f2elm_t _A24 = {0}; + f2elm_t *A24 = &_A24; + digit_t mask; + int i, nbits, swap, prevbit = 0; + + if (AliceOrBob == S2N_SIKE_P434_R3_ALICE) { + nbits = S2N_SIKE_P434_R3_OALICE_BITS; + } else { + nbits = S2N_SIKE_P434_R3_OBOB_BITS - 1; + } + + /* Initializing constant */ + fpcopy((const digit_t*)&Montgomery_one, A24->e[0]); + mp2_add(A24, A24, A24); + mp2_add(A, A24, A24); + fp2div2(A24, A24); + fp2div2(A24, A24); /* A24 = (A+2)/4 */ + + /* Initializing points */ + fp2copy(xQ, &R0->X); + fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R0->Z); + fp2copy(xPQ, &R2->X); + fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R2->Z); + fp2copy(xP, &R->X); + fpcopy((const digit_t*)&Montgomery_one, (digit_t*)&R->Z); + fpzero((digit_t*)(R->Z.e)[1]); + + /* Main loop */ + for (i = 0; i < nbits; i++) { + int bit = (m[i >> S2N_SIKE_P434_R3_LOG2RADIX] >> (i & (S2N_SIKE_P434_R3_RADIX-1))) & 1; + swap = bit ^ prevbit; + prevbit = bit; + mask = 0 - (digit_t)swap; + + swap_points(R, R2, mask); + xDBLADD(R0, R2, &R->X, A24); + fp2mul_mont(&R2->X, &R->Z, &R2->X); + } + swap = 0 ^ prevbit; + mask = 0 - (digit_t)swap; + swap_points(R, R2, mask); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h new file mode 100644 index 0000000000..44245ec726 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_ec_isogeny.h @@ -0,0 +1,46 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: elliptic curve and isogeny functions +*********************************************************************************************/ + +#pragma once + +#include "sikep434r3.h" + +#define xDBL S2N_SIKE_P434_R3_NAMESPACE(xDBL) +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24); + +#define xDBLe S2N_SIKE_P434_R3_NAMESPACE(xDBLe) +void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t *A24plus, const f2elm_t *C24, const int e); + +#define get_4_isog S2N_SIKE_P434_R3_NAMESPACE(get_4_isog) +void get_4_isog(const point_proj_t P, f2elm_t *A24plus, f2elm_t *C24, f2elm_t *coeff); + +#define eval_4_isog S2N_SIKE_P434_R3_NAMESPACE(eval_4_isog) +void eval_4_isog(point_proj_t P, f2elm_t* coeff); + +#define xTPL S2N_SIKE_P434_R3_NAMESPACE(xTPL) +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus); + +#define xTPLe S2N_SIKE_P434_R3_NAMESPACE(xTPLe) +void xTPLe(const 
point_proj_t P, point_proj_t Q, const f2elm_t *A24minus, const f2elm_t *A24plus, const int e); + +#define get_3_isog S2N_SIKE_P434_R3_NAMESPACE(get_3_isog) +void get_3_isog(const point_proj_t P, f2elm_t *A24minus, f2elm_t *A24plus, f2elm_t *coeff); + +#define eval_3_isog S2N_SIKE_P434_R3_NAMESPACE(eval_3_isog) +void eval_3_isog(point_proj_t Q, const f2elm_t *coeff); + +#define inv_3_way S2N_SIKE_P434_R3_NAMESPACE(inv_3_way) +void inv_3_way(f2elm_t *z1, f2elm_t *z2, f2elm_t *z3); + +#define get_A S2N_SIKE_P434_R3_NAMESPACE(get_A) +void get_A(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xR, f2elm_t *A); + +#define j_inv S2N_SIKE_P434_R3_NAMESPACE(j_inv) +void j_inv(const f2elm_t *A, const f2elm_t *C, f2elm_t *jinv); + +#define LADDER3PT S2N_SIKE_P434_R3_NAMESPACE(LADDER3PT) +void LADDER3PT(const f2elm_t *xP, const f2elm_t *xQ, const f2elm_t *xPQ, const digit_t *m, + const unsigned int AliceOrBob, point_proj_t R, const f2elm_t *A); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c new file mode 100644 index 0000000000..413cb2b8e4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.c @@ -0,0 +1,417 @@ +/******************************************************************************************** +* SHA3-derived function SHAKE +* +* Based on the public domain implementation in crypto_hash/keccakc512/simple/ +* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer +* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202 +* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe +* +* See NIST Special Publication 800-185 for more information: +* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf +* +*********************************************************************************************/ + +#include <stdint.h> +#include <stddef.h> +#include "sikep434r3.h" +#include "sikep434r3_fips202.h" + +#define NROUNDS 24 +#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) + +/************************************************* + * Name: load64 + * + * Description: Load 8 bytes into uint64_t in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns the loaded 64-bit unsigned integer + **************************************************/ +static uint64_t load64(const uint8_t *x) { + uint64_t r = 0; + for (size_t i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +/************************************************* + * Name: store64 + * + * Description: Store a 64-bit integer to a byte array in little-endian order + * + * Arguments: - uint8_t *x: pointer to the output byte array + * - uint64_t u: input 64-bit unsigned integer + **************************************************/ +static void store64(uint8_t *x, uint64_t u) { + for (size_t i = 0; i < 8; ++i) { + x[i] = (uint8_t) (u >> 8 * i); + } +} + +static const uint64_t KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + 
(uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL, +}; + +static void KeccakF1600_StatePermute(uint64_t * state) +{ + int round; + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + + /* copyFromState(A, state) */ + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < NROUNDS; round += 2 ) { + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + /* prepareTheta */ + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */ + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + /* prepareTheta */ + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = 
Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + /* copyToState(state, A) */ + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, + unsigned char p) +{ + unsigned long long i; + unsigned char t[200]; + + while (mlen >= r) { + for (i = 0; i < r / 8; ++i) + s[i] ^= load64(m + 8 * i); + + KeccakF1600_StatePermute(s); + mlen -= r; + m += r; + } + + for (i = 0; i < r; ++i) { + t[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t[i] = m[i]; + } + + t[i] = p; + t[r - 1] |= 128; + + for (i = 0; i < r / 8; ++i) { + s[i] ^= load64(t + 8 * i); + } +} + +static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r) +{ + unsigned int i; + + while(nblocks > 0) { + KeccakF1600_StatePermute(s); + for (i = 0; i < (r>>3); i++) { + store64(h+8*i, s[i]); + } + + h += r; + nblocks--; + } +} + +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned long long nblocks = outlen / SHAKE256_RATE; + size_t i; + + for (i = 0; i < 25; ++i) { + s[i] = 0; + } + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, input, 
inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); + + output += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + + for (i = 0; i < outlen; i++) { + output[i] = t[i]; + } + } +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h new file mode 100644 index 0000000000..9dd237a491 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fips202.h @@ -0,0 +1,23 @@ +/******************************************************************************************** +* SHA3-derived function SHAKE +* +* Based on the public domain implementation in crypto_hash/keccakc512/simple/ +* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer +* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202 +* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe +* +* See NIST Special Publication 800-185 for more information: +* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf +* +*********************************************************************************************/ + +#pragma once + +#include <stdint.h> +#include "sikep434r3.h" + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 + +#define shake256 S2N_SIKE_P434_R3_NAMESPACE(shake256) +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c new file mode 100644 index 0000000000..867ac0f6c1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.c @@ -0,0 +1,297 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: modular arithmetic for P434 +*********************************************************************************************/ + +#include "sikep434r3.h" +#include "pq-crypto/s2n_pq.h" +#include "sikep434r3_fp.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_fp_x64_asm.h" + +/* Multiprecision subtraction with correction with 2*p, c = a-b+2p. */ +void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_sub434_p2_asm(a, b, c); + return; + } +#endif + + unsigned int i, borrow = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x2)[i], borrow, c[i]); + } +} + +/* Multiprecision subtraction with correction with 4*p, c = a-b+4p. */ +void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_sub434_p4_asm(a, b, c); + return; + } +#endif + + unsigned int i, borrow = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x4)[i], borrow, c[i]); + } +} + +/* Modular addition, c = a+b mod p434. 
+ * Inputs: a, b in [0, 2*p434-1] + * Output: c in [0, 2*p434-1] */ +void fpadd434(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + fpadd434_asm(a, b, c); + return; + } +#endif + unsigned int i, carry = 0; + digit_t mask; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(carry, c[i], ((const digit_t*)p434x2)[i], carry, c[i]); + } + mask = 0 - (digit_t)carry; + + carry = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(carry, c[i], ((const digit_t*)p434x2)[i] & mask, carry, c[i]); + } +} + +/* Modular subtraction, c = a-b mod p434. + * Inputs: a, b in [0, 2*p434-1] + * Output: c in [0, 2*p434-1] */ +void fpsub434(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + fpsub434_asm(a, b, c); + return; + } +#endif + + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, c[i], ((const digit_t*)p434x2)[i] & mask, borrow, c[i]); + } +} + +/* Modular negation, a = -a mod p434. + * Input/output: a in [0, 2*p434-1] */ +void fpneg434(digit_t* a) +{ + unsigned int i, borrow = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, ((const digit_t*)p434x2)[i], a[i], borrow, a[i]); + } +} + +/* Modular division by two, c = a/2 mod p434. + * Input : a in [0, 2*p434-1] + * Output: c in [0, 2*p434-1] */ +void fpdiv2_434(const digit_t* a, digit_t* c) +{ + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); /* If a is odd compute a+p434 */ + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(carry, a[i], ((const digit_t*)p434)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, S2N_SIKE_P434_R3_NWORDS_FIELD); +} + +/* Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1]. 
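+ * The correction is branch-free: p434 is subtracted unconditionally, and if the
+ * subtraction borrows (i.e. a was already below p434) the same p434 is added back
+ * under an all-ones mask derived from the borrow.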
*/ +void fpcorrection434(digit_t* a) +{ + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], ((const digit_t*)p434)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + S2N_SIKE_P434_R3_ADDC(borrow, a[i], ((const digit_t*)p434)[i] & mask, borrow, a[i]); + } +} + +/* Digit multiplication, digit * digit -> 2-digit result */ +void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) +{ + register digit_t al, ah, bl, bh, temp; + digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; + digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); + + al = a & mask_low; /* Low part */ + ah = a >> (sizeof(digit_t) * 4); /* High part */ + bl = b & mask_low; + bh = b >> (sizeof(digit_t) * 4); + + albl = al*bl; + albh = al*bh; + ahbl = ah*bl; + ahbh = ah*bh; + c[0] = albl & mask_low; /* C00 */ + + res1 = albl >> (sizeof(digit_t) * 4); + res2 = ahbl & mask_low; + res3 = albh & mask_low; + temp = res1 + res2 + res3; + carry = temp >> (sizeof(digit_t) * 4); + c[0] ^= temp << (sizeof(digit_t) * 4); /* C01 */ + + res1 = ahbl >> (sizeof(digit_t) * 4); + res2 = albh >> (sizeof(digit_t) * 4); + res3 = ahbh & mask_low; + temp = res1 + res2 + res3 + carry; + c[1] = temp & mask_low; /* C10 */ + carry = temp & mask_high; + c[1] ^= (ahbh & mask_high) + carry; /* C11 */ +} + +/* Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. */ +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + S2N_SIKE_P434_R3_UNREFERENCED_PARAMETER(nwords); + mul434_asm(a, b, c); + return; + } +#endif + + unsigned int i, j; + digit_t t = 0, u = 0, v = 0, UV[2]; + unsigned int carry; + + for (i = 0; i < nwords; i++) { + for (j = 0; j <= i; j++) { + S2N_SIKE_P434_R3_MUL(a[j], b[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = nwords; i < 2*nwords-1; i++) { + for (j = i-nwords+1; j < nwords; j++) { + S2N_SIKE_P434_R3_MUL(a[j], b[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + c[2*nwords-1] = v; +} + +/* Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. + * mc = ma*R^-1 mod p434x2, where R = 2^448. + * If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. + * ma is assumed to be in Montgomery representation. 
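+ * The speed-up comes from the low words of p434+1 being zero (see asm_p434p1 in
+ * sikep434r3_fp_x64_asm.S): the inner loops below skip the partial products that
+ * would involve those zero words and only multiply by the upper words of p434p1.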
*/ +void rdc_mont(digit_t* ma, digit_t* mc) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + rdc434_asm(ma, mc); + return; + } +#endif + + unsigned int i, j, carry, count = S2N_SIKE_P434_R3_ZERO_WORDS; + digit_t UV[2], t = 0, u = 0, v = 0; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + mc[i] = 0; + } + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + for (j = 0; j < i; j++) { + if (j < (i-S2N_SIKE_P434_R3_ZERO_WORDS+1)) { + S2N_SIKE_P434_R3_MUL(mc[j], ((const digit_t*)p434p1)[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + S2N_SIKE_P434_R3_ADDC(0, v, ma[i], carry, v); + S2N_SIKE_P434_R3_ADDC(carry, u, 0, carry, u); + t += carry; + mc[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = S2N_SIKE_P434_R3_NWORDS_FIELD; i < 2*S2N_SIKE_P434_R3_NWORDS_FIELD-1; i++) { + if (count > 0) { + count -= 1; + } + for (j = i-S2N_SIKE_P434_R3_NWORDS_FIELD+1; j < S2N_SIKE_P434_R3_NWORDS_FIELD; j++) { + if (j < (S2N_SIKE_P434_R3_NWORDS_FIELD-count)) { + S2N_SIKE_P434_R3_MUL(mc[j], ((const digit_t*)p434p1)[i-j], UV+1, UV[0]); + S2N_SIKE_P434_R3_ADDC(0, UV[0], v, carry, v); + S2N_SIKE_P434_R3_ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + S2N_SIKE_P434_R3_ADDC(0, v, ma[i], carry, v); + S2N_SIKE_P434_R3_ADDC(carry, u, 0, carry, u); + t += carry; + mc[i-S2N_SIKE_P434_R3_NWORDS_FIELD] = v; + v = u; + u = t; + t = 0; + } + + /* `carry` isn't read after this, but it's still a necessary argument to the macro */ + /* cppcheck-suppress unreadVariable */ + S2N_SIKE_P434_R3_ADDC(0, v, ma[2*S2N_SIKE_P434_R3_NWORDS_FIELD-1], carry, v); + mc[S2N_SIKE_P434_R3_NWORDS_FIELD-1] = v; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h new file mode 100644 index 0000000000..7844ba0457 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp.h @@ -0,0 +1,39 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: modular arithmetic for P434 +*********************************************************************************************/ + +#pragma once + +#include "sikep434r3.h" + +#define mp_sub434_p2 S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2) +void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_sub434_p4 S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4) +void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpadd434 S2N_SIKE_P434_R3_NAMESPACE(fpadd434) +void fpadd434(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpsub434 S2N_SIKE_P434_R3_NAMESPACE(fpsub434) +void fpsub434(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpneg434 S2N_SIKE_P434_R3_NAMESPACE(fpneg434) +void fpneg434(digit_t* a); + +#define fpdiv2_434 S2N_SIKE_P434_R3_NAMESPACE(fpdiv2_434) +void fpdiv2_434(const digit_t* a, digit_t* c); + +#define fpcorrection434 S2N_SIKE_P434_R3_NAMESPACE(fpcorrection434) +void fpcorrection434(digit_t* a); + +#define digit_x_digit S2N_SIKE_P434_R3_NAMESPACE(digit_x_digit) +void digit_x_digit(const digit_t a, const digit_t b, digit_t* c); + +#define mp_mul S2N_SIKE_P434_R3_NAMESPACE(mp_mul) +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); + +#define rdc_mont S2N_SIKE_P434_R3_NAMESPACE(rdc_mont) +void rdc_mont(digit_t* ma, digit_t* mc); diff --git 
a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S new file mode 100644 index 0000000000..1814a8b25a --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.S @@ -0,0 +1,1054 @@ +//******************************************************************************************* +// Supersingular Isogeny Key Encapsulation Library +// +// Abstract: field arithmetic in x64 assembly for P434 on Linux +//******************************************************************************************* + +/* Requires bmi2 instruction set for mulx. adx instructions are optional, but preferred. */ +.intel_syntax noprefix + +#define S2N_SIKE_P434_R3_NAMESPACE(s) s2n_sike_p434_r3_##s + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + +// Define addition instructions +#ifdef S2N_ADX + +#define ADD1 adox +#define ADC1 adox +#define ADD2 adcx +#define ADC2 adcx + +#else + +#define ADD1 add +#define ADC1 adc +#define ADD2 add +#define ADC2 adc + +#endif + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text + +#define asm_p434 S2N_SIKE_P434_R3_NAMESPACE(asm_p434) +.align 32 +.type asm_p434, @object +.size asm_p434, 56 +asm_p434: +.quad -1 +.quad -1 +.quad -1 +.quad -161717841442111489 +.quad 8918917783347572387 +.quad 7853257225132122198 +.quad 620258357900100 + + +#define asm_p434x2 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x2) +.align 32 +.type asm_p434x2, @object +.size asm_p434x2, 56 +asm_p434x2: +.quad -2 +.quad -1 +.quad -1 +.quad -323435682884222977 +.quad -608908507014406841 +.quad -2740229623445307220 +.quad 1240516715800200 + + +#define asm_p434x4 S2N_SIKE_P434_R3_NAMESPACE(asm_p434x4) +.align 32 +.type asm_p434x4, @object +.size asm_p434x4, 56 +asm_p434x4: +.quad -4 +.quad -1 +.quad -1 +.quad -646871365768445953 +.quad -1217817014028813681 +.quad -5480459246890614439 +.quad 2481033431600401 + + +#define asm_p434p1 S2N_SIKE_P434_R3_NAMESPACE(asm_p434p1) +.align 32 +.type asm_p434p1, @object +.size asm_p434p1, 56 +asm_p434p1: +.quad 0 +.quad 0 +.quad 0 +.quad -161717841442111488 +.quad 8918917783347572387 +.quad 7853257225132122198 +.quad 620258357900100 + +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm) +.global fpadd434_asm +fpadd434_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + push rbp + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + + mov rbx, [rip+asm_p434x2] + sub r8, rbx + mov rcx, [rip+asm_p434x2+8] + sbb r9, rcx + sbb r10, rcx + mov rdi, [rip+asm_p434x2+24] + sbb r11, rdi + mov rsi, [rip+asm_p434x2+32] + sbb r12, rsi + mov rbp, [rip+asm_p434x2+40] + sbb r13, rbp + mov r15, [rip+asm_p434x2+48] + sbb r14, r15 + sbb rax, 0 + + and rbx, rax + and rcx, rax + and rdi, rax + and rsi, rax + and rbp, rax + and r15, rax + + add r8, rbx + adc r9, rcx + adc r10, rcx + adc r11, rdi + adc r12, rsi + adc r13, rbp + adc r14, r15 + mov [reg_p3], r8 + mov [reg_p3+8], r9 + 
mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm) +.global fpsub434_asm +fpsub434_asm: + push r12 + push r13 + push r14 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb rax, 0 + + mov rcx, [rip+asm_p434x2] + mov rdi, [rip+asm_p434x2+8] + mov rsi, [rip+asm_p434x2+24] + and rcx, rax + and rdi, rax + and rsi, rax + add r8, rcx + adc r9, rdi + adc r10, rdi + adc r11, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + mov r8, [rip+asm_p434x2+32] + mov rdi, [rip+asm_p434x2+40] + mov rsi, [rip+asm_p434x2+48] + and r8, rax + and rdi, rax + and rsi, rax + bt rcx, 0 + adc r12, r8 + adc r13, rdi + adc r14, rsi + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + pop r14 + pop r13 + pop r12 + ret + +///////////////////////////////////////////////////////////////// MACRO +.macro SUB434_PX P0 + push r12 + push r13 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov rcx, [reg_p1+48] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb rcx, [reg_p2+48] + + mov rax, [rip+\P0] + mov rdi, [rip+\P0+8] + mov rsi, [rip+\P0+24] + add r8, rax + mov rax, [rip+\P0+32] + adc r9, rdi + adc r10, rdi + adc r11, rsi + mov rdi, [rip+\P0+40] + mov rsi, [rip+\P0+48] + adc r12, rax + adc r13, rdi + adc rcx, rsi + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], rcx + + pop r13 + pop r12 +.endm + +//*********************************************************************** +// Multiprecision subtraction with correction with 2*p434 +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434 +//*********************************************************************** +#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm) +.global mp_sub434_p2_asm +mp_sub434_p2_asm: + SUB434_PX asm_p434x2 + ret + +//*********************************************************************** +// Multiprecision subtraction with correction with 4*p434 +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434 +//*********************************************************************** +#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm) +.global mp_sub434_p4_asm +mp_sub434_p4_asm: + SUB434_PX asm_p434x4 + ret + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory pointer C and regs T1, T3, rax +// Temps: regs T0:T6 +///////////////////////////////////////////////////////////////// +#ifdef S2N_ADX + +.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 + mov rdx, \M0 + mulx 
\T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + xor rax, rax + adox \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adox \T1, \T3 + + mov rdx, 8\M0 + mulx \T3, \T4, \M1 // T3:T4 = A1*B0 + adox \T2, rax + xor rax, rax + mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 + adox \T4, \T0 + mov 8\C, \T4 // C1_final + adcx \T3, \T6 + mulx \T6, \T0, 16\M1 // T6:T0 = A1*B2 + adox \T3, \T1 + adcx \T5, \T0 + adcx \T6, rax + adox \T5, \T2 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + adox \T6, rax + xor rax, rax + mulx \T4, \T2, 8\M1 // T4:T2 = A2*B1 + adox \T0, \T3 + mov 16\C, \T0 // C2_final + adcx \T1, \T5 + mulx \T0, \T3, 16\M1 // T0:T3 = A2*B2 + adcx \T4, \T6 + adcx \T0, rax + adox \T1, \T2 + adox \T3, \T4 + adox rax, \T0 +.endm + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory pointer C +// Temps: regs T0:T9 +///////////////////////////////////////////////////////////////// +.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + xor rax, rax + adox \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adox \T1, \T3 + mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 + adox \T2, \T4 + + mov rdx, 8\M0 + mulx \T5, \T4, \M1 // T5:T4 = A1*B0 + adox \T3, rax + xor rax, rax + mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 + adox \T4, \T0 + mov 8\C, \T4 // C1_final + adcx \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 + adcx \T6, \T8 + adox \T5, \T1 + mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 + adcx \T7, \T9 + adcx \T8, rax + adox \T6, \T2 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + adox \T7, \T3 + adox \T8, rax + xor rax, rax + mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 + adox \T0, \T5 + mov 16\C, \T0 // C2_final + adcx \T1, \T3 + mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 + adcx \T2, \T4 + adox \T1, \T6 + mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 + adcx \T3, \T9 + mov rdx, 24\M0 + adcx \T4, rax + + adox \T2, \T7 + adox \T3, \T8 + adox \T4, rax + + mulx \T5, \T0, \M1 // T5:T0 = A3*B0 + xor rax, rax + mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 + adcx \T5, \T7 + adox \T1, \T0 + mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 + adcx \T6, \T8 + adox \T2, \T5 + mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 + adcx \T7, \T9 + adcx \T8, rax + + adox \T3, \T6 + adox \T4, \T7 + adox \T8, rax + mov 24\C, \T1 // C3_final + mov 32\C, \T2 // C4_final + mov 40\C, \T3 // C5_final + mov 48\C, \T4 // C6_final + mov 56\C, \T8 // C7_final +.endm + +#else // S2N_ADX + +.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + add \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adc \T1, \T3 + + mov rdx, 8\M0 + mulx \T3, \T4, \M1 // T3:T4 = A1*B0 + adc \T2, 0 + mulx \T5, \T6, 8\M1 // T5:T6 = A1*B1 + add \T4, \T0 + mov 8\C, \T4 // C1_final + adc \T3, \T1 + adc \T5, \T2 + mulx \T2, \T1, 16\M1 // T2:T1 = A1*B2 + adc \T2, 0 + + add \T3, \T6 + adc \T5, \T1 + adc \T2, 0 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + add \T0, \T3 + mov 16\C, \T0 // C2_final + mulx \T4, \T6, 8\M1 // T4:T6 = A2*B1 + adc \T1, \T5 + adc \T2, \T4 + mulx rax, \T3, 16\M1 // rax:T3 = A2*B2 + adc rax, 0 + add \T1, \T6 + adc \T3, \T2 + adc rax, 0 +.endm + +.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 
= A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + add \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adc \T1, \T3 + mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 + adc \T2, \T4 + mov rdx, 8\M0 + adc \T3, 0 + + mulx \T5, \T4, \M1 // T5:T4 = A1*B0 + mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 + add \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 + adc \T6, \T8 + mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 + adc \T7, \T9 + adc \T8, 0 + + add \T4, \T0 + mov 8\C, \T4 // C1_final + adc \T5, \T1 + adc \T6, \T2 + adc \T7, \T3 + mov rdx, 16\M0 + adc \T8, 0 + + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 + add \T1, \T3 + mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 + adc \T2, \T4 + mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 + adc \T3, \T9 + mov rdx, 24\M0 + adc \T4, 0 + + add \T0, \T5 + mov 16\C, \T0 // C2_final + adc \T1, \T6 + adc \T2, \T7 + adc \T3, \T8 + adc \T4, 0 + + mulx \T5, \T0, \M1 // T5:T0 = A3*B0 + mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 + add \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 + adc \T6, \T8 + mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 + adc \T7, \T9 + adc \T8, 0 + + add \T1, \T0 + mov 24\C, \T1 // C3_final + adc \T2, \T5 + mov 32\C, \T2 // C4_final + adc \T3, \T6 + mov 40\C, \T3 // C5_final + adc \T4, \T7 + mov 48\C, \T4 // C6_final + adc \T8, 0 + mov 56\C, \T8 // C7_final +.endm + +#endif // S2N_ADX + +//***************************************************************************** +// 434-bit multiplication using Karatsuba (one level), schoolbook (one level) +//***************************************************************************** +#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm) +.global mul434_asm +mul434_asm: + push r12 + push r13 + push r14 + push r15 + mov rcx, reg_p3 + + // r8-r11 <- AH + AL, rax <- mask + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + push rbx + push rbp + sub rsp, 96 + add r8, [reg_p1+32] + adc r9, [reg_p1+40] + adc r10, [reg_p1+48] + adc r11, 0 + sbb rax, 0 + mov [rsp], r8 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + + // r12-r15 <- BH + BL, rbx <- mask + xor rbx, rbx + mov r12, [reg_p2] + mov r13, [reg_p2+8] + mov r14, [reg_p2+16] + mov r15, [reg_p2+24] + add r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, 0 + sbb rbx, 0 + mov [rsp+32], r12 + mov [rsp+40], r13 + mov [rsp+48], r14 + mov [rsp+56], r15 + + // r12-r15 <- masked (BH + BL) + and r12, rax + and r13, rax + and r14, rax + and r15, rax + + // r8-r11 <- masked (AH + AL) + and r8, rbx + and r9, rbx + and r10, rbx + and r11, rbx + + // r8-r11 <- masked (AH + AL) + masked (AH + AL) + add r8, r12 + adc r9, r13 + adc r10, r14 + adc r11, r15 + mov [rsp+64], r8 + mov [rsp+72], r9 + mov [rsp+80], r10 + mov [rsp+88], r11 + + // [rsp] <- (AH+AL) x (BH+BL), low part + MUL256_SCHOOL [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + // [rcx] <- AL x BL + MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 + + // [rcx+64], rbx, rbp, rax <- AH x BH + MUL192_SCHOOL [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14 + + // r8-r11 <- (AH+AL) x (BH+BL), final step + mov r8, [rsp+64] + mov r9, [rsp+72] + mov r10, [rsp+80] + mov r11, [rsp+88] + mov rdx, [rsp+32] + add r8, rdx + mov rdx, [rsp+40] + adc r9, rdx + mov rdx, [rsp+48] + adc r10, rdx + mov rdx, [rsp+56] + adc r11, rdx + + // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL + mov r12, [rsp] + mov r13, [rsp+8] + mov r14, [rsp+16] + mov r15, 
[rsp+24] + sub r12, [rcx] + sbb r13, [rcx+8] + sbb r14, [rcx+16] + sbb r15, [rcx+24] + sbb r8, [rcx+32] + sbb r9, [rcx+40] + sbb r10, [rcx+48] + sbb r11, [rcx+56] + + // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + sub r12, [rcx+64] + sbb r13, [rcx+72] + sbb r14, [rcx+80] + sbb r15, rbx + sbb r8, rbp + sbb r9, rax + sbb r10, 0 + sbb r11, 0 + + add r12, [rcx+32] + mov [rcx+32], r12 // Result C4-C7 + adc r13, [rcx+40] + mov [rcx+40], r13 + adc r14, [rcx+48] + mov [rcx+48], r14 + adc r15, [rcx+56] + mov [rcx+56], r15 + adc r8, [rcx+64] + mov [rcx+64], r8 // Result C8-C15 + adc r9, [rcx+72] + mov [rcx+72], r9 + adc r10, [rcx+80] + mov [rcx+80], r10 + adc r11, rbx + mov [rcx+88], r11 + adc rbp, 0 + mov [rcx+96], rbp + adc rax, 0 + mov [rcx+104], rax + + add rsp, 96 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: reg I0 and memory pointer M1 +// Outputs: regs T0:T4 +// Temps: regs T0:T5 +///////////////////////////////////////////////////////////////// +.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5 + mulx \T2, \T4, 8\M1 + xor rax, rax + mulx \T3, \T5, 16\M1 + ADD1 \T1, \T4 // T1 <- C1_final + ADC1 \T2, \T5 // T2 <- C2_final + mulx \T4, \T5, 24\M1 + ADC1 \T3, \T5 // T3 <- C3_final + ADC1 \T4, rax // T4 <- C4_final +.endm + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: regs I0 and I1, and memory pointer M1 +// Outputs: regs T0:T5 +// Temps: regs T0:T5 +///////////////////////////////////////////////////////////////// +#ifdef S2N_ADX + +.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 + mulx \T2, \T4, 8\M1 + xor rax, rax + mulx \T3, \T5, 16\M1 + ADD1 \T1, \T4 + ADC1 \T2, \T5 + mulx \T4, \T5, 24\M1 + ADC1 \T3, \T5 + ADC1 \T4, rax + + xor rax, rax + mov rdx, \I1 + mulx \I1, \T5, \M1 + ADD2 \T1, \T5 // T1 <- C1_final + ADC2 \T2, \I1 + mulx \T5, \I1, 8\M1 + ADC2 \T3, \T5 + ADD1 \T2, \I1 + mulx \T5, \I1, 16\M1 + ADC2 \T4, \T5 + ADC1 \T3, \I1 + mulx \T5, \I1, 24\M1 + ADC2 \T5, rax + ADC1 \T4, \I1 + ADC1 \T5, rax +.endm + +#else // S2N_ADX + +.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 + mulx \T2, \T4, 8\M1 + mulx \T3, \T5, 16\M1 + add \T1, \T4 + adc \T2, \T5 + mulx \T4, \T5, 24\M1 + adc \T3, \T5 + adc \T4, 0 + + mov rdx, \I1 + mulx \I1, \T5, \M1 + add \T1, \T5 // T1 <- C1_final + adc \T2, \I1 + mulx \T5, \I1, 8\M1 + adc \T3, \T5 + mulx \T5, rax, 16\M1 + adc \T4, \T5 + mulx \T5, rdx, 24\M1 + adc \T5, 0 + add \T2, \I1 + adc \T3, rax + adc \T4, rdx + adc \T5, 0 +.endm + +#endif // S2N_ADX + +//************************************************************************************** +// Montgomery reduction +// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 +// Operation: c [reg_p2] = a [reg_p1] +//************************************************************************************** +#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm) +.global rdc434_asm +rdc434_asm: + push r14 + + // a[0-1] x p434p1_nz --> result: r8:r13 + mov rdx, [reg_p1] + mov r14, [reg_p1+8] + mulx r9, r8, [rip+asm_p434p1+24] // result r8 + push r12 + push r13 + push r15 + push rbp + push rbx + MUL128x256_SCHOOL rdx, r14, [rip+asm_p434p1+24], r8, r9, r10, r11, r12, r13 + + mov rdx, [reg_p1+16] + mov rcx, [reg_p1+72] + add r8, [reg_p1+24] + adc r9, [reg_p1+32] + adc r10, [reg_p1+40] + adc r11, [reg_p1+48] + adc r12, [reg_p1+56] + adc r13, [reg_p1+64] + adc rcx, 0 + mulx rbp, rbx, [rip+asm_p434p1+24] // result rbx + mov [reg_p2], r9 + mov [reg_p2+8], r10 + mov [reg_p2+16], r11 + mov [reg_p2+24], r12 + mov [reg_p2+32], r13 + mov r9, [reg_p1+80] + mov r10, [reg_p1+88] + mov r11, [reg_p1+96] + mov rdi, [reg_p1+104] + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc rdi, 0 + + // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15 + MUL128x256_SCHOOL rdx, r8, [rip+asm_p434p1+24], rbx, rbp, r12, r13, r14, r15 + + mov rdx, [reg_p2] + add rbx, [reg_p2+8] + adc rbp, [reg_p2+16] + adc r12, [reg_p2+24] + adc r13, [reg_p2+32] + adc r14, rcx + mov rcx, 0 + adc r15, r9 + adc rcx, r10 + mulx r9, r8, [rip+asm_p434p1+24] // result r8 + mov [reg_p2], rbp + mov [reg_p2+8], r12 + mov [reg_p2+16], r13 + adc r11, 0 + adc rdi, 0 + + // a[4-5] x p434p1_nz --> result: r8:r13 + MUL128x256_SCHOOL rdx, rbx, [rip+asm_p434p1+24], r8, r9, r10, rbp, r12, r13 + + mov rdx, [reg_p2] + add r8, [reg_p2+8] + adc r9, [reg_p2+16] + adc r10, r14 + adc rbp, r15 + adc r12, rcx + adc r13, r11 + adc rdi, 0 + mulx r15, r14, [rip+asm_p434p1+24] // result r14 + mov [reg_p2], r8 // Final result c0-c1 + mov [reg_p2+8], r9 + + // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11 + MUL64x256_SCHOOL rdx, [rip+asm_p434p1+24], r14, r15, r8, r9, r11, rcx + + // Final result c2:c6 + add r14, r10 + adc r15, rbp + pop rbx + pop rbp + adc r8, r12 + adc r9, r13 + adc r11, rdi + mov [reg_p2+16], r14 + mov [reg_p2+24], r15 + pop r15 + pop r13 + mov [reg_p2+32], r8 + mov [reg_p2+40], r9 + mov [reg_p2+48], r11 + + pop r12 + pop r14 + ret + +//*********************************************************************** +// 434-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm) +.global mp_add434_asm +mp_add434_asm: + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + adc r8, [reg_p2+32] + adc r9, [reg_p2+40] + adc r10, [reg_p2+48] + mov [reg_p3+32], r8 + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + ret + +//*************************************************************************** +// 2x434-bit multiprecision subtraction/addition +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
If c < 0, add p434*2^448 +//*************************************************************************** +#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm) +.global mp_subadd434x2_asm +mp_subadd434x2_asm: + push r12 + push r13 + push r14 + push r15 + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + + mov r8, [reg_p1+40] + mov r9, [reg_p1+48] + mov r10, [reg_p1+56] + mov r11, [reg_p1+64] + mov r12, [reg_p1+72] + sbb r8, [reg_p2+40] + sbb r9, [reg_p2+48] + sbb r10, [reg_p2+56] + sbb r11, [reg_p2+64] + sbb r12, [reg_p2+72] + mov [reg_p3+40], r8 + mov [reg_p3+48], r9 + mov [reg_p3+56], r10 + + mov r13, [reg_p1+80] + mov r14, [reg_p1+88] + mov r15, [reg_p1+96] + mov rcx, [reg_p1+104] + sbb r13, [reg_p2+80] + sbb r14, [reg_p2+88] + sbb r15, [reg_p2+96] + sbb rcx, [reg_p2+104] + sbb rax, 0 + + // Add p434 anded with the mask in rax + mov r8, [rip+asm_p434] + mov r9, [rip+asm_p434+24] + mov r10, [rip+asm_p434+32] + mov rdi, [rip+asm_p434+40] + mov rsi, [rip+asm_p434+48] + and r8, rax + and r9, rax + and r10, rax + and rdi, rax + and rsi, rax + mov rax, [reg_p3+56] + add rax, r8 + adc r11, r8 + adc r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, rdi + adc rcx, rsi + + mov [reg_p3+56], rax + mov [reg_p3+64], r11 + mov [reg_p3+72], r12 + mov [reg_p3+80], r13 + mov [reg_p3+88], r14 + mov [reg_p3+96], r15 + mov [reg_p3+104], rcx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +//*********************************************************************** +// Double 2x434-bit multiprecision subtraction +// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] +//*********************************************************************** +#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm) +.global mp_dblsub434x2_asm +mp_dblsub434x2_asm: + push r12 + push r13 + push r14 + + mov r8, [reg_p3] + mov r9, [reg_p3+8] + mov r10, [reg_p3+16] + mov r11, [reg_p3+24] + mov r12, [reg_p3+32] + mov r13, [reg_p3+40] + mov r14, [reg_p3+48] + sub r8, [reg_p1] + sbb r9, [reg_p1+8] + sbb r10, [reg_p1+16] + sbb r11, [reg_p1+24] + sbb r12, [reg_p1+32] + sbb r13, [reg_p1+40] + sbb r14, [reg_p1+48] + setc al + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + setc cl + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + + mov r8, [reg_p3+56] + mov r9, [reg_p3+64] + mov r10, [reg_p3+72] + mov r11, [reg_p3+80] + mov r12, [reg_p3+88] + mov r13, [reg_p3+96] + mov r14, [reg_p3+104] + bt rax, 0 + sbb r8, [reg_p1+56] + sbb r9, [reg_p1+64] + sbb r10, [reg_p1+72] + sbb r11, [reg_p1+80] + sbb r12, [reg_p1+88] + sbb r13, [reg_p1+96] + sbb r14, [reg_p1+104] + bt rcx, 0 + sbb r8, [reg_p2+56] + sbb r9, [reg_p2+64] + sbb r10, [reg_p2+72] + sbb r11, [reg_p2+80] + sbb r12, [reg_p2+88] + sbb r13, [reg_p2+96] + sbb r14, [reg_p2+104] + mov [reg_p3+56], r8 + mov [reg_p3+64], r9 + mov [reg_p3+72], r10 + mov [reg_p3+80], r11 + mov [reg_p3+88], r12 + mov [reg_p3+96], r13 + mov [reg_p3+104], r14 + + pop r14 + pop r13 + pop r12 + ret diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h 
b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h new file mode 100644 index 0000000000..1753e25fb4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fp_x64_asm.h @@ -0,0 +1,38 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: x86_64 assembly optimized modular arithmetic for P434 +*********************************************************************************************/ + +#pragma once + +#if defined(S2N_SIKE_P434_R3_ASM) + +#define fpadd434_asm S2N_SIKE_P434_R3_NAMESPACE(fpadd434_asm) +void fpadd434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define fpsub434_asm S2N_SIKE_P434_R3_NAMESPACE(fpsub434_asm) +void fpsub434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mul434_asm S2N_SIKE_P434_R3_NAMESPACE(mul434_asm) +void mul434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define rdc434_asm S2N_SIKE_P434_R3_NAMESPACE(rdc434_asm) +void rdc434_asm(digit_t* ma, digit_t* mc); + +#define mp_add434_asm S2N_SIKE_P434_R3_NAMESPACE(mp_add434_asm) +void mp_add434_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_subadd434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_subadd434x2_asm) +void mp_subadd434x2_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_dblsub434x2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_dblsub434x2_asm) +void mp_dblsub434x2_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_sub434_p2_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p2_asm) +void mp_sub434_p2_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp_sub434_p4_asm S2N_SIKE_P434_R3_NAMESPACE(mp_sub434_p4_asm) +void mp_sub434_p4_asm(const digit_t* a, const digit_t* b, digit_t* c); + +#endif diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c new file mode 100644 index 0000000000..40c61144e4 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.c @@ -0,0 +1,478 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ + +#include <string.h> +#include "sikep434r3.h" +#include "sikep434r3_fp.h" +#include "sikep434r3_fpx.h" +#include "pq-crypto/s2n_pq.h" +#include "sikep434r3_fp_x64_asm.h" + +static void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc); +static void to_mont(const felm_t a, felm_t mc); +static void from_mont(const felm_t ma, felm_t c); +static void fpsqr_mont(const felm_t ma, felm_t mc); +static unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); +static void fpinv_chain_mont(felm_t a); +static void fpinv_mont(felm_t a); +static void to_fp2mont(const f2elm_t *a, f2elm_t *mc); +static void from_fp2mont(const f2elm_t *ma, f2elm_t *c); + +/* Encoding digits to bytes according to endianness */ +__inline static void encode_to_bytes(const digit_t* x, unsigned char* enc, int nbytes) +{ + if (is_big_endian()) { + int ndigits = nbytes / sizeof(digit_t); + int rem = nbytes % sizeof(digit_t); + + for (int i = 0; i < ndigits; i++) { + digit_t temp = S2N_SIKE_P434_R3_BSWAP_DIGIT(x[i]); + memcpy(enc + (i * sizeof(digit_t)), (unsigned char *)&temp, sizeof(digit_t)); + } + + if (rem) { + 
digit_t ld = S2N_SIKE_P434_R3_BSWAP_DIGIT(x[ndigits]); + memcpy(enc + ndigits * sizeof(digit_t), (unsigned char *) &ld, rem); + } + } else { + memcpy(enc, (const unsigned char *) x, nbytes); + } +} + +/* Conversion of GF(p^2) element from Montgomery to standard representation, + * and encoding by removing leading 0 bytes */ +void fp2_encode(const f2elm_t *x, unsigned char *enc) +{ + f2elm_t t; + + from_fp2mont(x, &t); + encode_to_bytes(t.e[0], enc, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2); + encode_to_bytes(t.e[1], enc + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2); +} + +/* Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation */ +void fp2_decode(const unsigned char *x, f2elm_t *dec) +{ + decode_to_digits(x, dec->e[0], S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_NWORDS_FIELD); + decode_to_digits(x + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, dec->e[1], S2N_SIKE_P434_R3_FP2_ENCODED_BYTES / 2, S2N_SIKE_P434_R3_NWORDS_FIELD); + to_fp2mont(dec, dec); +} + +/* Multiprecision multiplication, c = a*b mod p. */ +static void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) +{ + dfelm_t temp = {0}; + + mp_mul(ma, mb, temp, S2N_SIKE_P434_R3_NWORDS_FIELD); + rdc_mont(temp, mc); +} + +/* Conversion to Montgomery representation, + * mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. + * The Montgomery constant R^2 mod p is the global value "Montgomery_R2". */ +static void to_mont(const felm_t a, felm_t mc) +{ + fpmul_mont(a, (const digit_t*)&Montgomery_R2, mc); +} + +/* Conversion from Montgomery representation to standard representation, + * c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. */ +static void from_mont(const felm_t ma, felm_t c) +{ + digit_t one[S2N_SIKE_P434_R3_NWORDS_FIELD] = {0}; + + one[0] = 1; + fpmul_mont(ma, one, c); + fpcorrection434(c); +} + +/* Copy wordsize digits, c = a, where lng(a) = nwords. */ +void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords) +{ + unsigned int i; + + for (i = 0; i < nwords; i++) { + c[i] = a[i]; + } +} + +/* Multiprecision squaring, c = a^2 mod p. */ +static void fpsqr_mont(const felm_t ma, felm_t mc) +{ + dfelm_t temp = {0}; + + mp_mul(ma, ma, temp, S2N_SIKE_P434_R3_NWORDS_FIELD); + rdc_mont(temp, mc); +} + +/* Copy a GF(p^2) element, c = a. */ +void fp2copy(const f2elm_t *a, f2elm_t *c) +{ + fpcopy(a->e[0], c->e[0]); + fpcopy(a->e[1], c->e[1]); +} + +/* GF(p^2) division by two, c = a/2 in GF(p^2). */ +void fp2div2(const f2elm_t *a, f2elm_t *c) +{ + fpdiv2_434(a->e[0], c->e[0]); + fpdiv2_434(a->e[1], c->e[1]); +} + +/* Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. */ +unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ + unsigned int i, carry = 0; + + for (i = 0; i < nwords; i++) { + S2N_SIKE_P434_R3_ADDC(carry, a[i], b[i], carry, c[i]); + } + + return carry; +} + +/* GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
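+ * Since i^2 = -1, (a0 + a1*i)^2 = (a0^2 - a1^2) + 2*a0*a1*i, and a0^2 - a1^2 is
+ * obtained as the single product (a0+a1)*(a0-a1), so the squaring costs two
+ * field multiplications.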
+ * Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] + * Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] */ +void fp2sqr_mont(const f2elm_t *a, f2elm_t *c) +{ + felm_t t1, t2, t3; + + mp_addfast(a->e[0], a->e[1], t1); /* t1 = a0+a1 */ + mp_sub434_p4(a->e[0], a->e[1], t2); /* t2 = a0-a1 */ + mp_addfast(a->e[0], a->e[0], t3); /* t3 = 2a0 */ + fpmul_mont(t1, t2, c->e[0]); /* c0 = (a0+a1)(a0-a1) */ + fpmul_mont(t3, a->e[1], c->e[1]); /* c1 = 2a0*a1 */ +} + +/* Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. */ +static unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ + unsigned int i, borrow = 0; + + for (i = 0; i < nwords; i++) { + S2N_SIKE_P434_R3_SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + return borrow; +} + +/* Multiprecision subtraction followed by addition with p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD, + * c = a-b+(p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. */ +__inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_subadd434x2_asm(a, b, c); + return; + } +#endif + + felm_t t1; + + digit_t mask = 0 - (digit_t)mp_sub(a, b, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD); + for (int i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + t1[i] = ((const digit_t *) p434)[i] & mask; + } + mp_addfast((digit_t*)&c[S2N_SIKE_P434_R3_NWORDS_FIELD], t1, (digit_t*)&c[S2N_SIKE_P434_R3_NWORDS_FIELD]); +} + +/* Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*S2N_SIKE_P434_R3_NWORDS_FIELD. */ +__inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_dblsub434x2_asm(a, b, c); + return; + } +#endif + + mp_sub(c, a, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD); + mp_sub(c, b, c, 2*S2N_SIKE_P434_R3_NWORDS_FIELD); +} + +/* GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). + * Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] + * Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] */ +void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + felm_t t1, t2; + dfelm_t tt1, tt2, tt3; + + mp_addfast(a->e[0], a->e[1], t1); /* t1 = a0+a1 */ + mp_addfast(b->e[0], b->e[1], t2); /* t2 = b0+b1 */ + mp_mul(a->e[0], b->e[0], tt1, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt1 = a0*b0 */ + mp_mul(a->e[1], b->e[1], tt2, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt2 = a1*b1 */ + mp_mul(t1, t2, tt3, S2N_SIKE_P434_R3_NWORDS_FIELD); /* tt3 = (a0+a1)*(b0+b1) */ + mp_dblsubfast(tt1, tt2, tt3); /* tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 */ + mp_subaddfast(tt1, tt2, tt1); /* tt1 = a0*b0 - a1*b1 + p*2^S2N_SIKE_P434_R3_MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1 */ + rdc_mont(tt3, c->e[1]); /* c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 */ + rdc_mont(tt1, c->e[0]); /* c[0] = a0*b0 - a1*b1 */ +} + +/* Chain to compute a^(p-3)/4 using Montgomery arithmetic. 
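+ * The fixed addition chain below raises a to the power (p434-3)/4. fpinv_mont()
+ * then squares the result twice and multiplies by a once more, giving
+ * a^(p434-2) = a^(-1) mod p434 by Fermat's little theorem.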
*/ +static void fpinv_chain_mont(felm_t a) +{ + unsigned int i, j; + felm_t t[31], tt; + + /* Precomputed table */ + fpsqr_mont(a, tt); + fpmul_mont(a, tt, t[0]); + for (i = 0; i <= 29; i++) { + fpmul_mont(t[i], tt, t[i + 1]); + } + + fpcopy(a, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 10; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[3], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[7], tt, tt); + for (i = 0; i < 8; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 8; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[30], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[21], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[19], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[16], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 8; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[25], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(a, tt, tt); + for (i = 0; i < 7; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[28], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 6; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 9; i++) { + fpsqr_mont(tt, tt); + } + fpmul_mont(t[22], tt, tt); + for (j = 0; j < 35; j++) { + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[30], tt, tt); + } + fpcopy(tt, a); +} + +/* Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. */ +static void fpinv_mont(felm_t a) +{ + felm_t tt; + + fpcopy(a, tt); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, a); +} + +/* GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). 
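+ * Uses the conjugate formula: since i^2 = -1, (a0 + i*a1)^(-1) = (a0 - i*a1)/(a0^2 + a1^2).
+ * The norm a0^2 + a1^2 is inverted once in GF(p), a1 is negated, and both
+ * components are then scaled by the inverted norm.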
*/ +void fp2inv_mont(f2elm_t *a) +{ + f2elm_t t1; + + fpsqr_mont(a->e[0], t1.e[0]); /* t10 = a0^2 */ + fpsqr_mont(a->e[1], t1.e[1]); /* t11 = a1^2 */ + fpadd434(t1.e[0], t1.e[1], t1.e[0]); /* t10 = a0^2+a1^2 */ + fpinv_mont(t1.e[0]); /* t10 = (a0^2+a1^2)^-1 */ + fpneg434(a->e[1]); /* a = a0-i*a1 */ + fpmul_mont(a->e[0], t1.e[0], a->e[0]); + fpmul_mont(a->e[1], t1.e[0], a->e[1]); /* a = (a0-i*a1)*(a0^2+a1^2)^-1 */ +} + +/* Conversion of a GF(p^2) element to Montgomery representation, + * mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). */ +static void to_fp2mont(const f2elm_t *a, f2elm_t *mc) +{ + to_mont(a->e[0], mc->e[0]); + to_mont(a->e[1], mc->e[1]); +} + +/* Conversion of a GF(p^2) element from Montgomery representation to standard representation, + * c_i = ma_i*R^(-1) = a_i in GF(p^2). */ +static void from_fp2mont(const f2elm_t *ma, f2elm_t *c) +{ + from_mont(ma->e[0], c->e[0]); + from_mont(ma->e[1], c->e[1]); +} + +/* Multiprecision right shift by one. */ +void mp_shiftr1(digit_t* x, const unsigned int nwords) +{ + unsigned int i; + + for (i = 0; i < nwords-1; i++) { + S2N_SIKE_P434_R3_SHIFTR(x[i+1], x[i], 1, x[i], S2N_SIKE_P434_R3_RADIX); + } + x[nwords-1] >>= 1; +} + +void decode_to_digits(const unsigned char* x, digit_t* dec, int nbytes, int ndigits) +{ + dec[ndigits - 1] = 0; + memcpy((unsigned char*)dec, x, nbytes); + + if (is_big_endian()) { + for (int i = 0; i < ndigits; i++) { + dec[i] = S2N_SIKE_P434_R3_BSWAP_DIGIT(dec[i]); + } + } +} + +void fpcopy(const felm_t a, felm_t c) +{ + unsigned int i; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + c[i] = a[i]; + } +} + +void fpzero(felm_t a) +{ + unsigned int i; + + for (i = 0; i < S2N_SIKE_P434_R3_NWORDS_FIELD; i++) { + a[i] = 0; + } +} + +void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + fpadd434(a->e[0], b->e[0], c->e[0]); + fpadd434(a->e[1], b->e[1], c->e[1]); +} + +void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + fpsub434(a->e[0], b->e[0], c->e[0]); + fpsub434(a->e[1], b->e[1], c->e[1]); +} + +void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c) +{ +#if defined(S2N_SIKE_P434_R3_ASM) + if (s2n_sikep434r3_asm_is_enabled()) { + mp_add434_asm(a, b, c); + return; + } +#endif + + mp_add(a, b, c, S2N_SIKE_P434_R3_NWORDS_FIELD); +} + +void mp2_add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + mp_addfast(a->e[0], b->e[0], c->e[0]); + mp_addfast(a->e[1], b->e[1], c->e[1]); +} + +void mp2_sub_p2(const f2elm_t *a, const f2elm_t *b, f2elm_t *c) +{ + mp_sub434_p2(a->e[0], b->e[0], c->e[0]); + mp_sub434_p2(a->e[1], b->e[1], c->e[1]); +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h new file mode 100644 index 0000000000..bce1849ce1 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_fpx.h @@ -0,0 +1,65 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ + +#pragma once + +#include <string.h> +#include "sikep434r3.h" +#include "sikep434r3_fp.h" + +#define fp2_encode S2N_SIKE_P434_R3_NAMESPACE(fp2_encode) +void fp2_encode(const f2elm_t *x, unsigned char *enc); + +#define fp2_decode S2N_SIKE_P434_R3_NAMESPACE(fp2_decode) +void fp2_decode(const unsigned char *x, f2elm_t *dec); + +#define copy_words 
S2N_SIKE_P434_R3_NAMESPACE(copy_words) +void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords); + +#define fp2copy S2N_SIKE_P434_R3_NAMESPACE(fp2copy) +void fp2copy(const f2elm_t *a, f2elm_t *c); + +#define fp2div2 S2N_SIKE_P434_R3_NAMESPACE(fp2div2) +void fp2div2(const f2elm_t *a, f2elm_t *c); + +#define mp_add S2N_SIKE_P434_R3_NAMESPACE(mp_add) +unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); + +#define fp2sqr_mont S2N_SIKE_P434_R3_NAMESPACE(fp2sqr_mont) +void fp2sqr_mont(const f2elm_t *a, f2elm_t *c); + +#define fp2mul_mont S2N_SIKE_P434_R3_NAMESPACE(fp2mul_mont) +void fp2mul_mont(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define fp2inv_mont S2N_SIKE_P434_R3_NAMESPACE(fp2inv_mont) +void fp2inv_mont(f2elm_t *a); + +#define mp_shiftr1 S2N_SIKE_P434_R3_NAMESPACE(mp_shiftr1) +void mp_shiftr1(digit_t* x, const unsigned int nwords); + +#define decode_to_digits S2N_SIKE_P434_R3_NAMESPACE(decode_to_digits) +void decode_to_digits(const unsigned char* x, digit_t* dec, int nbytes, int ndigits); + +#define fpcopy S2N_SIKE_P434_R3_NAMESPACE(fpcopy) +void fpcopy(const felm_t a, felm_t c); + +#define fpzero S2N_SIKE_P434_R3_NAMESPACE(fpzero) +void fpzero(felm_t a); + +#define fp2add S2N_SIKE_P434_R3_NAMESPACE(fp2add) +void fp2add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define fp2sub S2N_SIKE_P434_R3_NAMESPACE(fp2sub) +void fp2sub(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define mp_addfast S2N_SIKE_P434_R3_NAMESPACE(mp_addfast) +void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c); + +#define mp2_add S2N_SIKE_P434_R3_NAMESPACE(mp2_add) +void mp2_add(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); + +#define mp2_sub_p2 S2N_SIKE_P434_R3_NAMESPACE(mp2_sub_p2) +void mp2_sub_p2(const f2elm_t *a, const f2elm_t *b, f2elm_t *c); diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c new file mode 100644 index 0000000000..b32add7723 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_kem.c @@ -0,0 +1,112 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny key encapsulation (SIKE) protocol +*********************************************************************************************/ + +#include <string.h> +#include "sikep434r3.h" +#include "sikep434r3_fips202.h" +#include "utils/s2n_safety.h" +#include "tls/s2n_kem.h" +#include "pq-crypto/s2n_pq.h" +#include "pq-crypto/s2n_pq_random.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_api.h" + +/* SIKE's key generation + * Outputs: secret key sk (S2N_SIKE_P434_R3_SECRET_KEY_BYTES = S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) + * public key pk (S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) */ +int s2n_sike_p434_r3_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + + /* Generate lower portion of secret key sk <- s||SK */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(sk, S2N_SIKE_P434_R3_MSG_BYTES)); + POSIX_GUARD(random_mod_order_B(sk + S2N_SIKE_P434_R3_MSG_BYTES)); + + /* Generate public key pk */ + EphemeralKeyGeneration_B(sk + S2N_SIKE_P434_R3_MSG_BYTES, pk); + + /* Append public key pk to secret key sk */ + memcpy(&sk[S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES], pk, 
S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + + return S2N_SUCCESS; +} + +/* SIKE's encapsulation + * Input: public key pk (S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) + * Outputs: shared secret ss (S2N_SIKE_P434_R3_SHARED_SECRET_BYTES bytes) + * ciphertext message ct (S2N_SIKE_P434_R3_CIPHERTEXT_BYTES = S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES + S2N_SIKE_P434_R3_MSG_BYTES bytes) */ +int s2n_sike_p434_r3_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + + unsigned char ephemeralsk[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES]; + unsigned char jinvariant[S2N_SIKE_P434_R3_FP2_ENCODED_BYTES]; + unsigned char h[S2N_SIKE_P434_R3_MSG_BYTES]; + unsigned char temp[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES]; + + /* Generate ephemeralsk <- G(m||pk) mod oA */ + POSIX_GUARD_RESULT(s2n_get_random_bytes(temp, S2N_SIKE_P434_R3_MSG_BYTES)); + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], pk, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + shake256(ephemeralsk, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, temp, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + ephemeralsk[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES - 1] &= S2N_SIKE_P434_R3_MASK_ALICE; + + /* Encrypt */ + EphemeralKeyGeneration_A(ephemeralsk, ct); + EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant); + shake256(h, S2N_SIKE_P434_R3_MSG_BYTES, jinvariant, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES); + for (int i = 0; i < S2N_SIKE_P434_R3_MSG_BYTES; i++) { + ct[i + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES] = temp[i] ^ h[i]; + } + + /* Generate shared secret ss <- H(m||ct) */ + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], ct, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES); + shake256(ss, S2N_SIKE_P434_R3_SHARED_SECRET_BYTES, temp, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + + return S2N_SUCCESS; +} + +/* SIKE's decapsulation + * Input: secret key sk (S2N_SIKE_P434_R3_SECRET_KEY_BYTES = S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES bytes) + * ciphertext message ct (S2N_SIKE_P434_R3_CIPHERTEXT_BYTES = S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES + S2N_SIKE_P434_R3_MSG_BYTES bytes) + * Outputs: shared secret ss (S2N_SIKE_P434_R3_SHARED_SECRET_BYTES bytes) */ +int s2n_sike_p434_r3_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) +{ + POSIX_ENSURE(s2n_pq_is_enabled(), S2N_ERR_PQ_DISABLED); + + unsigned char ephemeralsk_[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES]; + unsigned char jinvariant_[S2N_SIKE_P434_R3_FP2_ENCODED_BYTES]; + unsigned char h_[S2N_SIKE_P434_R3_MSG_BYTES]; + unsigned char c0_[S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES]; + unsigned char temp[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES]; + + /* Decrypt */ + EphemeralSecretAgreement_B(sk + S2N_SIKE_P434_R3_MSG_BYTES, ct, jinvariant_); + shake256(h_, S2N_SIKE_P434_R3_MSG_BYTES, jinvariant_, S2N_SIKE_P434_R3_FP2_ENCODED_BYTES); + for (int i = 0; i < S2N_SIKE_P434_R3_MSG_BYTES; i++) { + temp[i] = ct[i + S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES] ^ h_[i]; + } + + /* Generate ephemeralsk_ <- G(m||pk) mod oA */ + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], &sk[S2N_SIKE_P434_R3_MSG_BYTES + S2N_SIKE_P434_R3_SECRETKEY_B_BYTES], S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + shake256(ephemeralsk_, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, temp, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + ephemeralsk_[S2N_SIKE_P434_R3_SECRETKEY_A_BYTES - 1] &= S2N_SIKE_P434_R3_MASK_ALICE; + + /* Generate shared secret ss <- H(m||ct), or output ss <- H(s||ct) in case of ct 
verification failure */ + EphemeralKeyGeneration_A(ephemeralsk_, c0_); + + /* Verify ciphertext. + * If c0_ and ct are NOT equal, decaps failed and we overwrite the shared secret + * with pseudorandom noise (ss = H(s||ct)) by performing the copy (dont_copy = false). + * + * If c0_ and ct are equal, then decaps succeeded and we skip the overwrite and output + * the actual shared secret: ss = H(m||ct) (dont_copy = true). */ + bool dont_copy = s2n_constant_time_equals(c0_, ct, S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES); + POSIX_GUARD(s2n_constant_time_copy_or_dont(temp, sk, S2N_SIKE_P434_R3_MSG_BYTES, dont_copy)); + memcpy(&temp[S2N_SIKE_P434_R3_MSG_BYTES], ct, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES); + shake256(ss, S2N_SIKE_P434_R3_SHARED_SECRET_BYTES, temp, S2N_SIKE_P434_R3_CIPHERTEXT_BYTES+S2N_SIKE_P434_R3_MSG_BYTES); + + return S2N_SUCCESS; +} diff --git a/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c new file mode 100644 index 0000000000..f570e27e32 --- /dev/null +++ b/contrib/restricted/aws/s2n/pq-crypto/sike_r3/sikep434r3_sidh.c @@ -0,0 +1,310 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) +*********************************************************************************************/ + +#include "sikep434r3.h" +#include "pq-crypto/s2n_pq_random.h" +#include "utils/s2n_safety.h" +#include "sikep434r3_fpx.h" +#include "sikep434r3_ec_isogeny.h" +#include "sikep434r3_api.h" + +/* Initialization of basis points */ +static void init_basis(const digit_t *gen, f2elm_t *XP, f2elm_t *XQ, f2elm_t *XR) +{ + fpcopy(gen, XP->e[0]); + fpcopy(gen + S2N_SIKE_P434_R3_NWORDS_FIELD, XP->e[1]); + fpcopy(gen + 2*S2N_SIKE_P434_R3_NWORDS_FIELD, XQ->e[0]); + fpcopy(gen + 3*S2N_SIKE_P434_R3_NWORDS_FIELD, XQ->e[1]); + fpcopy(gen + 4*S2N_SIKE_P434_R3_NWORDS_FIELD, XR->e[0]); + fpcopy(gen + 5*S2N_SIKE_P434_R3_NWORDS_FIELD, XR->e[1]); +} + +/* Generation of Bob's secret key + * Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] */ +int random_mod_order_B(unsigned char* random_digits) +{ + POSIX_GUARD_RESULT(s2n_get_random_bytes(random_digits, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES)); + random_digits[S2N_SIKE_P434_R3_SECRETKEY_B_BYTES-1] &= S2N_SIKE_P434_R3_MASK_BOB; /* Masking last byte */ + + return 0; +} + +/* Alice's ephemeral public key generation + * Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. + * Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded + * by removing leading 0 bytes. 
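+ * Concretely, the basis points loaded from B_gen (phiP, phiQ, phiR) are pushed
+ * through Alice's chain of 4-isogenies and the x-coordinates of their images are
+ * serialized with fp2_encode().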
+int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA)
+{
+    point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE];
+    f2elm_t _XPA, _XQA, _XRA, coeff[3], _A24plus = {0}, _C24 = {0}, _A = {0};
+    f2elm_t *XPA=&_XPA, *XQA=&_XQA, *XRA=&_XRA, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
+    digit_t SecretKeyA[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize basis points */
+    init_basis((const digit_t*)A_gen, XPA, XQA, XRA);
+    init_basis((const digit_t*)B_gen, &phiP->X, &phiQ->X, &phiR->X);
+    fpcopy((const digit_t*)&Montgomery_one, (phiP->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiQ->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiR->Z.e)[0]);
+
+    /* Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 */
+    fpcopy((const digit_t*)&Montgomery_one, A24plus->e[0]);
+    mp2_add(A24plus, A24plus, A24plus);
+    mp2_add(A24plus, A24plus, C24);
+    mp2_add(A24plus, C24, A);
+    mp2_add(C24, C24, A24plus);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyA, SecretKeyA, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(XPA, XQA, XRA, SecretKeyA, S2N_SIKE_P434_R3_ALICE, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_ALICE; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_ALICE-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Alice[ii++];
+            xDBLe(R, R, A24plus, C24, (int)(2*m));
+            tree_index += m;
+        }
+        get_4_isog(R, A24plus, C24, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_4_isog(pts[i], coeff);
+        }
+        eval_4_isog(phiP, coeff);
+        eval_4_isog(phiQ, coeff);
+        eval_4_isog(phiR, coeff);
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_4_isog(R, A24plus, C24, coeff);
+    eval_4_isog(phiP, coeff);
+    eval_4_isog(phiQ, coeff);
+    eval_4_isog(phiR, coeff);
+
+    inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
+    fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
+    fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
+    fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
+
+    /* Format public key */
+    fp2_encode(&phiP->X, PublicKeyA);
+    fp2_encode(&phiQ->X, PublicKeyA + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+    fp2_encode(&phiR->X, PublicKeyA + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+
+    return 0;
+}
+
+/* Bob's ephemeral public key generation
+ * Input:  a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1].
+ * Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded
+ *         by removing leading 0 bytes. */
+int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB)
+{
+    point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB];
+    f2elm_t _XPB, _XQB, _XRB, coeff[3], _A24plus = {0}, _A24minus = {0}, _A = {0};
+    f2elm_t *XPB=&_XPB, *XQB=&_XQB, *XRB=&_XRB, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
+
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB], npts = 0, ii = 0;
+    digit_t SecretKeyB[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize basis points */
+    init_basis((const digit_t*)B_gen, XPB, XQB, XRB);
+    init_basis((const digit_t*)A_gen, &phiP->X, &phiQ->X, &phiR->X);
+    fpcopy((const digit_t*)&Montgomery_one, (phiP->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiQ->Z.e)[0]);
+    fpcopy((const digit_t*)&Montgomery_one, (phiR->Z.e)[0]);
+
+    /* Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 */
+    fpcopy((const digit_t*)&Montgomery_one, A24plus->e[0]);
+    mp2_add(A24plus, A24plus, A24plus);
+    mp2_add(A24plus, A24plus, A24minus);
+    mp2_add(A24plus, A24minus, A);
+    mp2_add(A24minus, A24minus, A24plus);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyB, SecretKeyB, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(XPB, XQB, XRB, SecretKeyB, S2N_SIKE_P434_R3_BOB, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_BOB; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_BOB-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Bob[ii++];
+            xTPLe(R, R, A24minus, A24plus, (int)m);
+            tree_index += m;
+        }
+        get_3_isog(R, A24minus, A24plus, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_3_isog(pts[i], coeff);
+        }
+        eval_3_isog(phiP, coeff);
+        eval_3_isog(phiQ, coeff);
+        eval_3_isog(phiR, coeff);
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_3_isog(R, A24minus, A24plus, coeff);
+    eval_3_isog(phiP, coeff);
+    eval_3_isog(phiQ, coeff);
+    eval_3_isog(phiR, coeff);
+
+    inv_3_way(&phiP->Z, &phiQ->Z, &phiR->Z);
+    fp2mul_mont(&phiP->X, &phiP->Z, &phiP->X);
+    fp2mul_mont(&phiQ->X, &phiQ->Z, &phiQ->X);
+    fp2mul_mont(&phiR->X, &phiR->Z, &phiR->X);
+
+    /* Format public key */
+    fp2_encode(&phiP->X, PublicKeyB);
+    fp2_encode(&phiQ->X, PublicKeyB + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+    fp2_encode(&phiR->X, PublicKeyB + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES);
+
+    return 0;
+}
+
+/* Alice's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
+ * Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1].
+ *         Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
+ * Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded
+ *         by removing leading 0 bytes. */
+int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB,
+                               unsigned char* SharedSecretA)
+{
+    point_proj_t R, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE];
+    f2elm_t coeff[3], PKB[3], _jinv;
+    f2elm_t _A24plus = {0}, _C24 = {0}, _A = {0};
+    f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *C24=&_C24, *A=&_A;
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_ALICE], npts = 0, ii = 0;
+    digit_t SecretKeyA[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize images of Bob's basis */
+    fp2_decode(PublicKeyB, &PKB[0]);
+    fp2_decode(PublicKeyB + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[1]);
+    fp2_decode(PublicKeyB + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[2]);
+
+    /* Initialize constants: A24plus = A+2C, C24 = 4C, where C=1 */
+    get_A(&PKB[0], &PKB[1], &PKB[2], A);
+    mp_add((const digit_t*)&Montgomery_one, (const digit_t*)&Montgomery_one, C24->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD);
+    mp2_add(A, C24, A24plus);
+    mp_add(C24->e[0], C24->e[0], C24->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyA, SecretKeyA, S2N_SIKE_P434_R3_SECRETKEY_A_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(&PKB[0], &PKB[1], &PKB[2], SecretKeyA, S2N_SIKE_P434_R3_ALICE, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_ALICE; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_ALICE-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Alice[ii++];
+            xDBLe(R, R, A24plus, C24, (int)(2*m));
+            tree_index += m;
+        }
+        get_4_isog(R, A24plus, C24, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_4_isog(pts[i], coeff);
+        }
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_4_isog(R, A24plus, C24, coeff);
+    mp2_add(A24plus, A24plus, A24plus);
+    fp2sub(A24plus, C24, A24plus);
+    fp2add(A24plus, A24plus, A24plus);
+    j_inv(A24plus, C24, jinv);
+    fp2_encode(jinv, SharedSecretA);    /* Format shared secret */
+
+    return 0;
+}
+
+/* Bob's ephemeral shared secret computation
+ * It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
+ * Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1].
+ *         Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
+ * Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded
+ *         by removing leading 0 bytes. */
+int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA,
+                               unsigned char* SharedSecretB)
+{
+    point_proj_t R, pts[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB];
+    f2elm_t coeff[3], PKB[3], _jinv;
+    f2elm_t _A24plus = {0}, _A24minus = {0}, _A = {0};
+    f2elm_t *jinv=&_jinv, *A24plus=&_A24plus, *A24minus=&_A24minus, *A=&_A;
+    unsigned int i, row, m, tree_index = 0, pts_index[S2N_SIKE_P434_R3_MAX_INT_POINTS_BOB], npts = 0, ii = 0;
+    digit_t SecretKeyB[S2N_SIKE_P434_R3_NWORDS_ORDER] = {0};
+
+    /* Initialize images of Alice's basis */
+    fp2_decode(PublicKeyA, &PKB[0]);
+    fp2_decode(PublicKeyA + S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[1]);
+    fp2_decode(PublicKeyA + 2*S2N_SIKE_P434_R3_FP2_ENCODED_BYTES, &PKB[2]);
+
+    /* Initialize constants: A24plus = A+2C, A24minus = A-2C, where C=1 */
+    get_A(&PKB[0], &PKB[1], &PKB[2], A);
+    mp_add((const digit_t*)&Montgomery_one, (const digit_t*)&Montgomery_one, A24minus->e[0], S2N_SIKE_P434_R3_NWORDS_FIELD);
+    mp2_add(A, A24minus, A24plus);
+    mp2_sub_p2(A, A24minus, A24minus);
+
+    /* Retrieve kernel point */
+    decode_to_digits(PrivateKeyB, SecretKeyB, S2N_SIKE_P434_R3_SECRETKEY_B_BYTES, S2N_SIKE_P434_R3_NWORDS_ORDER);
+    LADDER3PT(&PKB[0], &PKB[1], &PKB[2], SecretKeyB, S2N_SIKE_P434_R3_BOB, R, A);
+
+    /* Traverse tree */
+    tree_index = 0;
+    for (row = 1; row < S2N_SIKE_P434_R3_MAX_BOB; row++) {
+        while (tree_index < S2N_SIKE_P434_R3_MAX_BOB-row) {
+            fp2copy(&R->X, &pts[npts]->X);
+            fp2copy(&R->Z, &pts[npts]->Z);
+            pts_index[npts++] = tree_index;
+            m = strat_Bob[ii++];
+            xTPLe(R, R, A24minus, A24plus, (int)m);
+            tree_index += m;
+        }
+        get_3_isog(R, A24minus, A24plus, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_3_isog(pts[i], coeff);
+        }
+
+        fp2copy(&pts[npts-1]->X, &R->X);
+        fp2copy(&pts[npts-1]->Z, &R->Z);
+        tree_index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_3_isog(R, A24minus, A24plus, coeff);
+    fp2add(A24plus, A24minus, A);
+    fp2add(A, A, A);
+    fp2sub(A24plus, A24minus, A24plus);
+    j_inv(A, A24plus, jinv);
+    fp2_encode(jinv, SharedSecretB);    /* Format shared secret */
+
+    return 0;
+}
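
Editor's note (not part of the diff): the enc/dec entry points above follow the usual KEM calling convention, so a round trip can be sketched as below. This is a minimal sketch, assuming the matching keypair entry point is named s2n_sike_p434_r3_crypto_kem_keypair(pk, sk), that the prototypes and S2N_SIKE_P434_R3_*_BYTES sizes are visible via sikep434r3_api.h, and that PQ support is compiled in; it is illustrative only, not code from this commit.

/* Hypothetical KEM round trip using the functions added in this diff.
 * s2n_sike_p434_r3_crypto_kem_keypair is an assumed name; the size
 * constants and the enc/dec prototypes appear in the diff above. */
#include <string.h>
#include "sikep434r3_api.h"

int sike_round_trip_demo(void)
{
    unsigned char pk[S2N_SIKE_P434_R3_PUBLIC_KEY_BYTES];
    unsigned char sk[S2N_SIKE_P434_R3_SECRET_KEY_BYTES];
    unsigned char ct[S2N_SIKE_P434_R3_CIPHERTEXT_BYTES];
    unsigned char ss_enc[S2N_SIKE_P434_R3_SHARED_SECRET_BYTES];
    unsigned char ss_dec[S2N_SIKE_P434_R3_SHARED_SECRET_BYTES];

    /* 1. Generate a key pair (assumed entry point). */
    if (s2n_sike_p434_r3_crypto_kem_keypair(pk, sk) != 0) { return -1; }

    /* 2. Encapsulate against the public key: produces ct and the sender's ss. */
    if (s2n_sike_p434_r3_crypto_kem_enc(ct, ss_enc, pk) != 0) { return -1; }

    /* 3. Decapsulate with the secret key: recovers the same ss on success. */
    if (s2n_sike_p434_r3_crypto_kem_dec(ss_dec, ct, sk) != 0) { return -1; }

    /* Demo-only comparison; production callers should compare secrets in constant time. */
    return (memcmp(ss_enc, ss_dec, sizeof(ss_enc)) == 0) ? 0 : -1;
}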
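
Editor's note (not part of the diff): decapsulation above implements the Fujisaki-Okamoto implicit rejection via s2n_constant_time_equals and s2n_constant_time_copy_or_dont, so that a bad ciphertext yields ss = H(s||ct) without a secret-dependent branch. The following standalone sketch illustrates the "copy or don't" pattern generically; it is not s2n's implementation, and copy_or_dont_sketch is a hypothetical helper named here only for illustration.

/* Sketch of a branch-free conditional overwrite: when dont_copy is false every
 * byte of dst is replaced by src, when dont_copy is true dst is left unchanged,
 * and the same instructions execute in both cases. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static void copy_or_dont_sketch(uint8_t *dst, const uint8_t *src, size_t len, bool dont_copy)
{
    uint8_t keep = (uint8_t)dont_copy;      /* 1 = keep dst, 0 = overwrite with src */
    uint8_t mask = (uint8_t)(keep - 1u);    /* 0x00 when keeping, 0xFF when overwriting */
    for (size_t i = 0; i < len; i++) {
        dst[i] = (uint8_t)((dst[i] & (uint8_t)~mask) | (src[i] & mask));
    }
}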