// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Triple modulo p_384, z := (3 * x) mod p_384
// Input x[6]; output z[6]
//
//    extern void bignum_triple_p384
//     (uint64_t z[static 6], uint64_t x[static 6]);
//
// The input x can be any 6-digit bignum, not necessarily reduced modulo p_384,
// and the result is always fully reduced, i.e. z = (3 * x) mod p_384.
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
//
// Syntax: GNU as, AT&T operand order (source first, destination last),
// passed through the C preprocessor for the #define register names below.
//
// The routine is straight-line code: it contains no branches and no calls.
// It uses the ADCX and ADOX instructions, so it needs a CPU with the ADX
// extension.
//
// Registers written: rax, rcx, rdx, rsi, r8-r11, rbx and the flags. Of these
// rbx is saved on the stack and restored before returning; under WINDOWS_ABI
// rdi and rsi are saved and restored as well.
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

// The S2N_BN_* macros are not defined in this file; presumably they come from
// the header included above and handle symbol export and name decoration.

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p384)
        .text

// Pointer arguments: z is the 6-digit output buffer, x the 6-digit input.

#define z %rdi
#define x %rsi

// Digits of the running 384-bit value, least significant first.
// d4 lives in the callee-saved rbx, hence the push/pop in the routine.
// d5 deliberately shares rsi with x: the pointer x is last used by the load
// of 40(x), which happens immediately before d5 is first written.

#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %rbx
#define d5 %rsi

// Scratch registers: a is a zero or a temporary, c a temporary and then the
// correction mask, q the doubled digit and then the quotient estimate.

#define a %rax
#define c %rcx
#define q %rdx

// 32-bit views of a and q. Writing them zero-extends into the full 64-bit
// register, which gives shorter encodings for zeroing and small constants.

#define ashort %eax
#define qshort %edx

S2N_BN_SYMBOL(bignum_triple_p384):

// Under the Microsoft x64 ABI the arguments arrive in rcx and rdx, and rdi
// and rsi are callee-saved. Save them and move the arguments into the
// registers the rest of the code expects.

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
#endif

// We seem to need (just!) one extra register, which we need to save and restore

        pushq   %rbx

// Multiply, accumulating the result as 2^384 * h + [d5;d4;d3;d2;d1;d0]
// but actually immediately producing q = h + 1, our quotient approximation,
// by adding 1 to it.
//
// The tripling is done as x + 2 * x using two independent carry chains that
// run interleaved, one digit at a time:
//   - adcxq q, q   reads and writes only CF: q = 2 * x[i] + CF  (doubling)
//   - adoxq q, di  reads and writes only OF: di = x[i] + q + OF (adding x)
// The movq instructions in between do not modify any flags, so both chains
// stay intact across the loads and copies.

        xorl    ashort, ashort          // a = 0, and CF = OF = 0 to start both chains

        movq    (x), q                  // q = x[0]
        movq    q, d0                   // d0 = x[0]
        adcxq   q, q                    // q = 2 * x[0], carry out in CF
        adoxq   q, d0                   // d0 = low digit of 3 * x[0], carry out in OF
        movq    8(x), q
        movq    q, d1
        adcxq   q, q
        adoxq   q, d1
        movq    16(x), q
        movq    q, d2
        adcxq   q, q
        adoxq   q, d2
        movq    24(x), q
        movq    q, d3
        adcxq   q, q
        adoxq   q, d3
        movq    32(x), q
        movq    q, d4
        adcxq   q, q
        adoxq   q, d4
        movq    40(x), q                // last use of the pointer x ...
        movq    q, d5                   // ... because d5 is the same register (rsi)
        adcxq   q, q
        adoxq   q, d5

// Collect the two final carries: h = CF + OF is the digit of 3 * x above
// 2^384, so h is 0, 1 or 2. Since a is still zero, the two additions below
// just fold the carries into the constant 1, giving q = h + 1 in {1, 2, 3}.

        movl    $1, qshort              // q = 1 (movl leaves the flags alone)
        adcxq   a, q                    // q += CF
        adoxq   a, q                    // q += OF

// Initial subtraction of z - q * p_384, with bitmask c for the carry
// Actually done as an addition of (z - 2^384 * h) + q * (2^384 - p_384)
// which, because q = h + 1, is exactly 2^384 + (z - q * p_384), and
// therefore CF <=> 2^384 + (z - q * p_384) >= 2^384 <=> z >= q * p_384.
//
// In the comment above, z stands for the full product 3 * x (including h),
// not for the output pointer.
//
// Since 2^384 - p_384 = 2^128 + 2^96 - 2^32 + 1, the multiple
// q * (2^384 - p_384) has only three nonzero digits, built here as [q; c; a]:
//   a = q - (q << 32) mod 2^64   (this always borrows, because q >= 1)
//   c = (q << 32) - 1            (the borrow from the digit below)
//   q = q

        movq    q, c
        shlq    $32, c                  // c = q << 32
        movq    q, a
        subq    c, a                    // a = q - (q << 32), sets CF (borrow)
        sbbq    $0, c                   // c = (q << 32) - borrow

        addq    a, d0
        adcq    c, d1
        adcq    q, d2
        adcq    $0, d3
        adcq    $0, d4
        adcq    $0, d5                  // CF = 1 exactly when 3 * x >= q * p_384
        sbbq    c, c                    // c = -CF: all ones if CF was set, else 0
        notq    c                       // c = all ones if 3 * x < q * p_384, else 0

// Now use that mask for a masked addition of p_384, which again is in
// fact done by a masked subtraction of 2^384 - p_384, so that we only
// have three nonzero digits and so can avoid using another register.
//
// When the mask is all ones, [c; q; a] below becomes
// [1; 0x00000000ffffffff; 0xffffffff00000001] = 2^384 - p_384, and when the
// mask is zero all three digits are zero so the subtraction changes nothing.
// Working modulo 2^384, subtracting 2^384 - p_384 is the same as adding p_384.

        movl    $0x00000000ffffffff, qshort
        xorl    ashort, ashort          // a = 0
        andq    c, q                    // q = 0x00000000ffffffff if correcting, else 0
        subq    q, a                    // a = 0xffffffff00000001 if correcting, else 0
        negq    c                       // c = 1 if correcting, else 0

// Subtract with a borrow chain and store each digit as soon as it is final.
// The stores are plain movq instructions, which do not touch the flags, so
// the borrow survives from one sbbq to the next.

        subq    a, d0
        movq    d0, (z)
        sbbq    q, d1
        movq    d1, 8(z)
        sbbq    c, d2
        movq    d2, 16(z)
        sbbq    $0, d3
        movq    d3, 24(z)
        sbbq    $0, d4
        movq    d4, 32(z)
        sbbq    $0, d5
        movq    d5, 40(z)

// Return, restoring the saved registers in the reverse order of the pushes

        popq    %rbx

#if WINDOWS_ABI
        popq   %rsi
        popq   %rdi
#endif
        ret

// On Linux ELF targets, emit an empty .note.GNU-stack section; by the usual
// GNU toolchain convention this marks the object as not needing an
// executable stack.

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif