// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming
// x reduced
// Inputs c, x[6]; output z[6]
//
//    extern void bignum_cmul_p384
//     (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
//
// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x
// Microsoft x64 ABI:   RCX = z, RDX = c, R8 = x
//
// The body is straight-line code (no branches); the final correction is
// applied with a mask rather than a conditional jump. It uses the MULX
// instruction together with ADCX/ADOX, so the target CPU must provide them.
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"


        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p384)
        .text

// Output pointer; stays in its incoming register for the whole function

#define z %rdi

// Temporarily moved here for initial multiply. Note that x shares %rcx
// with the scratch register c below: the pointer is only needed up to the
// last load of the initial multiply, after which %rcx is reused.

#define x %rcx

// Likewise this is thrown away after initial multiply. It must live in
// %rdx because that is the implicit source operand of MULX.

#define m %rdx

// General scratch registers

#define a %rax
#define c %rcx

// The six-digit accumulator [d5;d4;d3;d2;d1;d0]. d0 reuses %rsi, which is
// free once the multiplier has been copied to %rdx; d5 is %r12, the one
// register here that has to be saved and restored.

#define d0 %rsi
#define d1 %r8
#define d2 %r9
#define d3 %r10
#define d4 %r11
#define d5 %r12

// Multiplier again for second stage: the quotient estimate q replaces the
// original multiplier in %rdx so that MULX can multiply by it directly.

#define q %rdx

// 32-bit views of the registers above. Writing these zero-extends into the
// full 64-bit register.

#define ashort %eax
#define cshort %ecx
#define qshort %edx

S2N_BN_SYMBOL(bignum_cmul_p384):

// On Windows, preserve %rdi and %rsi and move the three arguments from
// RCX, RDX, R8 into the standard-ABI registers RDI, RSI, RDX so that the
// rest of the code is ABI-independent.

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// We seem to need (just!) one extra register, which we need to save and restore

        pushq   %r12

// Shuffle inputs (since we want multiplier in %rdx). The order matters:
// the x pointer must be copied out of %rdx before the multiplier from
// %rsi overwrites it.

        movq    %rdx, x
        movq    %rsi, m

// Multiply, accumulating the result as 2^384 * h + [d5;d4;d3;d2;d1;d0]
// but actually immediately producing q = h + 1, our quotient approximation,
// by adding 1 to it. Note that by hypothesis x is reduced mod p_384, so our
// product is <= (2^64 - 1) * (p_384 - 1) and hence h <= 2^64 - 2, meaning
// there is no danger this addition of 1 could wrap.
//
// Each MULX yields a (low, high) pair: the low half goes to scratch a and is
// added into the digit produced as the high half of the previous step. MULX
// leaves the flags alone, so the ADD/ADC carry chain can be interleaved with
// the multiplications. The last MULX writes its high half into q = %rdx,
// i.e. over the multiplier itself, which is no longer needed.

        mulxq   (x), d0, d1
        mulxq   8(x), a, d2
        addq    a, d1
        mulxq   16(x), a, d3
        adcq    a, d2
        mulxq   24(x), a, d4
        adcq    a, d3
        mulxq   32(x), a, d5
        adcq    a, d4
        mulxq   40(x), a, q
        adcq    a, d5
        adcq    $1, q

// It's easy to see -p_384 <= z - q * p_384 < p_384, so we just need to
// subtract q * p_384 and then correct if that is negative by adding p_384.
// (In this derivation z stands for the full product 2^384 * h + l with
// l = [d5;d4;d3;d2;d1;d0], not for the output pointer.)
//
// Write p_384 = 2^384 - r where r = 2^128 + 2^96 - 2^32 + 1
//
// We want z - q * (2^384 - r)
//       = (2^384 * h + l) - q * (2^384 - r)
//       = 2^384 * (h - q) + (l + q * r)
//       = 2^384 * (-1) + (l + q * r)
//
// As 64-bit digits, r = [1; 0x00000000ffffffff; 0xffffffff00000001], so
// q * r is formed from two MULX products (digits 0 and 1 of r) plus q
// itself added in at digit 2.

// Zero c and, as a side effect, clear both CF and OF so that the two
// independent carry chains below (ADCX uses CF, ADOX uses OF) start clean.

        xorq    c, c

// [c;a] = q * 0xffffffff00000001: low half into d0, high half into d1

        movq    $0xffffffff00000001, a
        mulxq   a, a, c
        adcxq   a, d0
        adoxq   c, d1

// [c;a] = q * 0x00000000ffffffff: low half into d1, high half into d2

        movl    $0x00000000ffffffff, ashort
        mulxq   a, a, c
        adcxq   a, d1
        adoxq   c, d2

// The 2^128 term of r: add q itself at digit 2

        adcxq   q, d2

// Zero a and c with MOV rather than XOR, because both carry flags are
// still live here and MOV does not disturb them.

        movl    $0, ashort
        movl    $0, cshort

// Collect the pending OF carry into a (a = 0 + 0 + OF), then merge it
// together with the pending CF carry into d3 and ripple CF upward through
// d4 and d5 (c is zero, so these just propagate the carry).

        adoxq   a, a
        adcq    a, d3
        adcq    c, d4
        adcq    c, d5

// c = carry out of the 6-digit sum l + q * r (0 or 1); subtracting 1 then
// accounts for the 2^384 * (-1) term.

        adcq    c, c
        subq    $1, c

// The net c value is now the top word of the 7-word answer, hence will
// be -1 if we need a corrective addition, 0 otherwise, usable as a mask.
// Now use that mask for a masked addition of p_384, which again is in
// fact done by a masked subtraction of 2^384 - p_384, so that we only
// have three nonzero digits and so can avoid using another register.
//
// Build the masked digits of r = 2^384 - p_384 in [c; q; a]:
//   q = mask & 0x00000000ffffffff
//   a = 0 - q  (which is 0xffffffff00000001 when the mask is set, else 0)
//   c = mask & 1

        movl    $0x00000000ffffffff, qshort
        xorq    a, a
        andq    c, q
        subq    q, a
        andq    $1, c

// Subtract [0;0;0;c;q;a] from [d5;d4;d3;d2;d1;d0], storing each digit to
// the output as soon as it is final. The stores do not affect the flags,
// so the borrow chain runs unbroken through all six digits.

        subq    a, d0
        movq    d0, (z)
        sbbq    q, d1
        movq    d1, 8(z)
        sbbq    c, d2
        movq    d2, 16(z)
        sbbq    $0, d3
        movq    d3, 24(z)
        sbbq    $0, d4
        movq    d4, 32(z)
        sbbq    $0, d5
        movq    d5, 40(z)

// Return, restoring the saved register(s) in reverse order of saving

        popq    %r12
#if WINDOWS_ABI
        popq   %rsi
        popq   %rdi
#endif
        ret

// Mark the stack as non-executable in ELF objects on Linux

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif