// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming
// x reduced
// Inputs c, x[6]; output z[6]
//
//    extern void bignum_cmul_p384_alt
//     (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
//
// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x
// Microsoft x64 ABI:   RCX = z, RDX = c, R8 = x
//
// The code is straight-line (no branches): the final correction is applied
// through a mask rather than a conditional jump. Registers written are
// %rax, %rcx, %rdx, %rsi and %r8-%r12; of these %r12 is saved and restored
// here, and under WINDOWS_ABI so are %rdi and %rsi.
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"


        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p384_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p384_alt)
        .text

// Output pointer; stays in %rdi for the whole function

#define z %rdi

// Temporarily moved here for initial multiply
// (the input pointer arrives in %rdx, which mulq overwrites)

#define x %rcx

// Likewise this is thrown away after initial multiply
// (m is the single-word multiplier, i.e. the "c" argument of the C
// prototype; it shares %rsi with d5 below)

#define m %rsi

// a and d are the implicit low/high result registers of mulq.
// c is a carry/mask scratch; note that c, x and q are all names for %rcx,
// so only one of them is live at any point.

#define a %rax
#define c %rcx
#define d %rdx

// Six-word accumulator [d5;d4;d3;d2;d1;d0], least significant word in d0.
// d4 is the callee-saved %r12; d5 reuses %rsi once m is no longer needed.

#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12
#define d5 %rsi

// Multiplier again for second stage
// (q is the quotient estimate h + 1, held in %rcx after x is dead)

#define q %rcx

// 32-bit views of the registers above; writing them zero-extends into the
// full 64-bit register.

#define ashort %eax
#define dshort %edx

#define cshort %ecx
#define qshort %ecx

S2N_BN_SYMBOL(bignum_cmul_p384_alt):

// Under the Microsoft x64 ABI, %rdi and %rsi are callee-saved, so preserve
// them and then move the arguments into the standard-ABI registers so the
// rest of the code is identical for both conventions.

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// We seem to need (just!) one extra register, which we need to save and restore

        pushq   %r12

// Shuffle inputs (since we want %rdx for the high parts of products)

        movq    %rdx, x

// Multiply, accumulating the result as 2^384 * h + [d5;d4;d3;d2;d1;d0]
// but actually immediately producing q = h + 1, our quotient approximation,
// by adding 1 to it. Note that by hypothesis x is reduced mod p_384, so our
// product is <= (2^64 - 1) * (p_384 - 1) and hence h <= 2^64 - 2, meaning
// there is no danger this addition of 1 could wrap.

// x[0] * m: low word is d0, high word starts off d1

        movq    (x), a
        mulq    m
        movq    a, d0
        movq    d, d1

// x[1] * m: the xorq zeroes the next accumulator word before the addq, so
// the adcq leaves d2 = high word + carry out of d1

        movq    8(x), a
        mulq    m
        xorq    d2, d2
        addq    a, d1
        adcq    d, d2

// x[2] * m, same pattern one word up

        movq    16(x), a
        mulq    m
        xorq    d3, d3
        addq    a, d2
        adcq    d, d3

// x[3] * m, same pattern one word up

        movq    24(x), a
        mulq    m
        xorq    d4, d4
        addq    a, d3
        adcq    d, d4

// x[4] * m: here the carry out of d4 is folded directly into the high
// word in d, which becomes the initial d5 below

        movq    32(x), a
        mulq    m
        addq    a, d4
        adcq    $0, d

// x[5] * m: m and d5 are both %rsi, so the multiplier is copied into a
// before d5 is written. The memory operand 40(x) is the last use of x, so
// %rcx can then be reused as q and is initialized to 1 (movl does not
// change the flags). The final adcq gives q = 1 + high word + carry = h + 1.

        movq    m, a
        movq    d, d5
        mulq     40(x)
        movl    $1, qshort

        addq    a, d5
        adcq    d, q

// It's easy to see -p_384 <= z - q * p_384 < p_384, so we just need to
// subtract q * p_384 and then correct if that is negative by adding p_384.
// (In this derivation z stands for the full product 2^384 * h + l, where
// l = [d5;d4;d3;d2;d1;d0], rather than for the output pointer.)
//
// Write p_384 = 2^384 - r where r = 2^128 + 2^96 - 2^32 + 1
//
// We want z - q * (2^384 - r)
//       = (2^384 * h + l) - q * (2^384 - r)
//       = 2^384 * (h - q) + (l + q * r)
//       = 2^384 * (-1) + (l + q * r)
//
// As 64-bit words r = [1; 0x00000000ffffffff; 0xffffffff00000001], so
// q * r is accumulated as three partial products: q * 0xffffffff00000001
// at word 0, q * 0x00000000ffffffff at word 1, and q itself at word 2.

// Add q * 0xffffffff00000001 into [d1;d0] and q into d2

        movq    $0xffffffff00000001, a
        mulq    q
        addq    a, d0
        adcq    d, d1
        adcq    q, d2

// Copy q into a for the next multiply, because c is the same register as q
// and is about to be overwritten. The sbbq captures the carry out of d2 as
// 0 or -1 and the negq turns that into 0 or 1; the intervening movl leaves
// the flags alone.

        movq    q, a
        sbbq    c, c
        movl    $0x00000000ffffffff, dshort
        negq    c

// Add q * 0x00000000ffffffff into [d2;d1], then the saved carry into d3,
// and ripple the carry up through d5

        mulq    d
        addq    a, d1
        adcq    d, d2
        adcq    c, d3
        adcq    $0, d4
        adcq    $0, d5

// Carry out of the 6-word sum l + q * r: sbbq gives -1 if there was a carry
// and 0 if not, and notq inverts that. So c = 0 when the carry cancels the
// 2^384 * (-1) term (result already nonnegative) and c = -1 otherwise.

        sbbq    c, c
        notq    c

// The net c value is now the top word of the 7-word answer, hence will
// be -1 if we need a corrective addition, 0 otherwise, usable as a mask.
// Now use that mask for a masked addition of p_384, which again is in
// fact done by a masked subtraction of 2^384 - p_384, so that we only
// have three nonzero digits and so can avoid using another register.
//
// After this block [c; d; a] holds the three low words of 2^384 - p_384
// when the mask is set, namely [1; 0x00000000ffffffff; 0xffffffff00000001]
// (a is computed as 0 - d modulo 2^64), and all zeros when it is clear.

        movl    $0x00000000ffffffff, dshort
        xorq    a, a
        andq    c, d
        subq    d, a
        andq    $1, c

// Subtract the masked constant with a single borrow chain, storing each
// word to z as soon as it is final; the interleaved movq stores do not
// modify the flags, so the borrow carries through from one sbbq to the next.

        subq    a, d0
        movq    d0, (z)
        sbbq    d, d1
        movq    d1, 8(z)
        sbbq    c, d2
        movq    d2, 16(z)
        sbbq    $0, d3
        movq    d3, 24(z)
        sbbq    $0, d4
        movq    d4, 32(z)
        sbbq    $0, d5
        movq    d5, 40(z)

// Return, restoring saved registers in the reverse order of the pushes

        popq    %r12
#if WINDOWS_ABI
        popq   %rsi
        popq   %rdi
#endif
        ret

// Emit an empty .note.GNU-stack section on Linux/ELF, the GNU toolchain
// convention for declaring that this object does not need an executable stack

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif