// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming
// x reduced
// Inputs c, x[9]; output z[9]
//
//    extern void bignum_cmul_p521
//     (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
//
// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x
// Microsoft x64 ABI:   RCX = z, RDX = c, R8 = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p521)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p521)
        .text

#define z %rdi

// Temporarily moved here for initial multiply

#define x %rcx

// Likewise this is thrown away after initial multiply

#define c %rdx
#define cshort %edx

#define a %rax
#define dd %rax

// Digits: last one aliased to the local x pointer that's no longer needed

#define d0 %rsi
#define d1 %r8
#define d2 %r9
#define d3 %r10
#define d4 %r11
#define d5 %rbx
#define d6 %rbp
#define d7 %r12
#define d8 %r13
#define d9 %rcx

// Same as d9

#define h d9

S2N_BN_SYMBOL(bignum_cmul_p521):

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// Save additional registers to use

        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13

// Shuffle inputs (since we want the multiplier in %rdx)

        movq    %rdx, x
        movq    %rsi, c

// Multiply as [d9; ...; d0] = c * x.

        mulxq   (x), d0, d1
        mulxq   8(x), a, d2
        addq    a, d1
        mulxq   16(x), a, d3
        adcq    a, d2
        mulxq   24(x), a, d4
        adcq    a, d3
        mulxq   32(x), a, d5
        adcq    a, d4
        mulxq   40(x), a, d6
        adcq    a, d5
        mulxq   48(x), a, d7
        adcq    a, d6
        mulxq   56(x), a, d8
        adcq    a, d7
        mulxq   64(x), a, d9
        adcq    a, d8
        adcq    $0, d9

// Create an AND "dd" of digits d7,...,d1, a computation we hope will
// get nicely interleaved with the multiplication chain above.
// From the point of view of architectural dependencies we have to
// bunch it up here since AND destroys the flags and we overwrite the
// register used as a stage temporary variable for the multiplications.

        movq    d1, dd
        andq    d2, dd
        andq    d3, dd
        andq    d4, dd
        andq    d5, dd
        andq    d6, dd
        andq    d7, dd

// Extract the high part h==d9 and mask off the low part l = [d8;d7;...;d0]
// but stuff d8 with 1 bits at the left to ease a comparison below

        shldq   $55, d8, h
        orq     $~0x1FF, d8

// Decide whether h + l >= p_521 <=> h + l + 1 >= 2^521. Since this can only
// happen if digits d7,...,d1 are all 1s, we use the AND of them "dd" to
// condense the carry chain, and since we stuffed 1 bits into d8 we get
// the result in CF without an additional comparison. Hereafter we use c = 0.
// Since x was assumed reduced, h cannot be maximal, so the "lea" is safe,
// i.e. does not carry or wrap round.

        leaq    1(h), c
        addq    d0, c
        movl    $0, cshort
        adcq    c, dd
        movq    d8, a
        adcq    c, a

// Now if CF is set we want (h + l) - p_521 = (h + l + 1) - 2^521
// while otherwise we want just h + l. So mask h + l + CF to 521 bits.
// This masking also gets rid of the stuffing with 1s we did above.
// Write back the digits as they are generated.
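//
// Folding h back into the low part is correct because p_521 = 2^521 - 1,
// so 2^521 == 1 (mod p_521) and the product 2^521 * h + l is congruent
// to h + l: the first "adc" below adds h (plus CF) into d0, and the
// carry then ripples through d1,...,d8 via additions of c = 0.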
        adcq    h, d0
        movq    d0, (z)
        adcq    c, d1
        movq    d1, 8(z)
        adcq    c, d2
        movq    d2, 16(z)
        adcq    c, d3
        movq    d3, 24(z)
        adcq    c, d4
        movq    d4, 32(z)
        adcq    c, d5
        movq    d5, 40(z)
        adcq    c, d6
        movq    d6, 48(z)
        adcq    c, d7
        movq    d7, 56(z)
        adcq    c, d8
        andq    $0x1FF, d8
        movq    d8, 64(z)

// Restore registers and return

        popq    %r13
        popq    %r12
        popq    %rbp
        popq    %rbx

#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
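
// ----------------------------------------------------------------------------
// Illustrative call from C (a sketch only; the public header name and the
// sample values are assumptions, not part of this file):
//
//   uint64_t x[9] = {1, 2, 3, 4, 5, 6, 7, 8, 0};   // reduced, i.e. x < p_521
//   uint64_t z[9];
//   bignum_cmul_p521(z, 12345, x);                 // z := (12345 * x) mod p_521
// ----------------------------------------------------------------------------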