// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Triple modulo p_384, z := (3 * x) mod p_384
// Input x[6]; output z[6]
//
//    extern void bignum_triple_p384
//      (uint64_t z[static 6], uint64_t x[static 6]);
//
// x is an arbitrary 6-digit (6 x 64-bit) bignum; it does not have to be
// reduced modulo p_384 beforehand. The output is always the fully reduced
// value z = (3 * x) mod p_384.
//
// Standard ARM ABI: X0 = z, X1 = x
//
// Implementation notes: straight-line code with no branches other than the
// final ret, and no stack accesses. Only x2-x10 and the condition flags are
// written. All six input digits are loaded before any output digit is
// stored.
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p384)
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p384_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p384_alt)
        .text
        .balign 4

// Pointer arguments

#define z x0
#define x x1

// Digits of the running result, least significant first, followed by the
// overflow word "top" that sits above them after the multiplication by 3.

#define acc0 x2
#define acc1 x3
#define acc2 x4
#define acc3 x5
#define acc4 x6
#define acc5 x7
#define top x8

// Digits of the input as loaded. These are deliberately shifted up by one
// register relative to the result digits: in_i shares a register with
// acc_(i+1), and in5 shares one with top. The tripling sequence below
// relies on this, each instruction pair reading in_(i-1) and in_i before
// in_(i-1) is overwritten by acc_i.

#define in0 x3
#define in1 x4
#define in2 x5
#define in3 x6
#define in4 x7
#define in5 x8

// The register holding "top" is reused first for the quotient estimate and
// afterwards for the correction bitmask.

#define quot x8
#define mask x8

// Scratch registers

#define tmp0 x9
#define tmp1 x10

// Both exported names label the same code.

S2N_BN_SYMBOL(bignum_triple_p384):
S2N_BN_SYMBOL(bignum_triple_p384_alt):

// Fetch all six digits of x

        ldp     in0, in1, [x]
        ldp     in2, in3, [x, #16]
        ldp     in4, in5, [x, #32]

// Compute 3 * x = 2 * x + x as the 7-word value [top; acc5; ...; acc0].
// Digit i of 2 * x is produced on the fly: lsl for digit 0, and for the
// others extr, which takes 64 bits out of the pair in_i:in_(i-1) starting
// at bit 63, i.e. (in_i << 1) | (in_(i-1) >> 63). The adds/adcs chain adds
// in the digits of x itself; extr/lsr leave the flags alone, so the carry
// ripples through undisturbed. The last two instructions form the overflow
// word from the bit shifted out of in5 plus the final carry.

        lsl     acc0, in0, #1
        adds    acc0, acc0, in0
        extr    acc1, in1, in0, #63
        adcs    acc1, acc1, in1
        extr    acc2, in2, in1, #63
        adcs    acc2, acc2, in2
        extr    acc3, in3, in2, #63
        adcs    acc3, acc3, in3
        extr    acc4, in4, in3, #63
        adcs    acc4, acc4, in4
        extr    acc5, in5, in4, #63
        adcs    acc5, acc5, in5
        lsr     top, in5, #63
        adc     top, top, xzr

// Writing w = 3 * x, we have top = floor(w / 2^384) <= 2. For a value in
// this range the quotient estimate quot = top + 1 is never too small and
// overshoots by at most one: -p_384 <= w - quot * p_384 < p_384. So it is
// enough to subtract quot * p_384 and add p_384 back if that went negative.

        add     quot, top, #1

// Subtract quot * p_384. This is carried out as the addition
//
//   (w - 2^384 * top) + quot * (2^384 - p_384)
//
// which equals 2^384 + (w - quot * p_384) because quot = top + 1. The carry
// out of the 6-digit sum is therefore set exactly when w >= quot * p_384.
//
// Since 2^384 - p_384 = 2^128 + 2^96 - 2^32 + 1, the addend occupies three
// words: [quot; (quot << 32) - 1; quot - (quot << 32)]. The subs always
// borrows (quot >= 1), and the sbc folds that borrow into the middle word.

        lsl     tmp1, quot, #32
        subs    tmp0, quot, tmp1
        sbc     tmp1, tmp1, xzr

        adds    acc0, acc0, tmp0
        adcs    acc1, acc1, tmp1
        adcs    acc2, acc2, quot
        adcs    acc3, acc3, xzr
        adcs    acc4, acc4, xzr
        adcs    acc5, acc5, xzr

// mask = all ones if there was no carry (result negative), otherwise zero

        csetm   mask, cc

// Add (p_384 AND mask) to undo the over-subtraction when necessary. The
// digits of p_384, least significant first, are
//
//   0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe,
//   0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff
//
// The eor derives the masked second digit from the masked first one, and
// the top three masked digits are just the mask itself. The carry out of
// the final adc is intentionally dropped.

        mov     tmp0, #0x00000000ffffffff
        and     tmp0, tmp0, mask
        adds    acc0, acc0, tmp0
        eor     tmp0, tmp0, mask
        adcs    acc1, acc1, tmp0
        mov     tmp0, #0xfffffffffffffffe
        and     tmp0, tmp0, mask
        adcs    acc2, acc2, tmp0
        adcs    acc3, acc3, mask
        adcs    acc4, acc4, mask
        adc     acc5, acc5, mask

// Write the six digits of the reduced result to z

        stp     acc0, acc1, [z]
        stp     acc2, acc3, [z, #16]
        stp     acc4, acc5, [z, #32]

        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif