// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Reduce modulo group order, z := x mod n_521
// Input x[9]; output z[9]
//
//    extern void bignum_mod_n521_9
//     (uint64_t z[static 9], uint64_t x[static 9]);
//
// Reduction is modulo the group order of the NIST curve P-521.
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n521_9)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n521_9)
        .text

#define z %rdi
#define x %rsi

#define q %rdx
#define a %rax
#define c %rcx
#define d %r8

#define n0 %r9
#define n1 %r10
#define n2 %r11
#define n3 d

#define ashort %eax
#define cshort %ecx
#define qshort %edx

S2N_BN_SYMBOL(bignum_mod_n521_9):

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
#endif

// Load the top digit, putting a bit-stuffed version in the output buffer.
// The initial quotient estimate is q = h + 1 where x = 2^521 * h + t.
// The last add also clears the CF and OF flags ready for the carry chain.

        movq    64(x), q
        movq    $~0x1FF, a
        orq     q, a
        movq    a, 64(z)
        shrq    $9, q
        addq    $1, q

// Now load the other digits and form r = x - q * n_521 = (q * r_521 + t) - 2^521,
// which is stored in the output buffer. Thanks to the bit-stuffing at the
// start, we get r' = (q * r_521 + t) + (2^576 - 2^521) = r + 2^576 as the
// computed result including the top carry. Hence CF <=> r >= 0, while
// r' == r (mod 2^521) because things below bit 521 are uncorrupted. We
// keep the top word in the register c since we at least have that one free.

        movq    $0x449048e16ec79bf7, n0
        mulxq   n0, a, c
        adcxq   (x), a
        movq    a, (z)

        movq    $0xc44a36477663b851, n1
        mulxq   n1, a, d
        adcxq   8(x), a
        adoxq   c, a
        movq    a, 8(z)

        movq    $0x8033feb708f65a2f, n2
        mulxq   n2, a, c
        adcxq   16(x), a
        adoxq   d, a
        movq    a, 16(z)

        movq    $0xae79787c40d06994, a
        mulxq   a, a, d
        adcxq   24(x), a
        adoxq   c, a
        movq    a, 24(z)

        movl    $5, ashort
        mulxq   a, a, c
        adcxq   32(x), a
        adoxq   d, a
        movq    a, 32(z)

        movq    c, a            // a is now used for zero hereafter
        adoxq   c, c
        adcq    40(x), c
        movq    c, 40(z)

        movq    48(x), c
        adcq    a, c
        movq    c, 48(z)

        movq    56(x), c
        adcq    a, c
        movq    c, 56(z)

        movq    64(z), c
        adcq    a, c

// We already know r < n_521, but if it actually went negative then
// we need to add back n_521 again. Recycle q as a bitmask for r < 0,
// and just subtract r_521 and mask rather than literally adding 2^521.
// This also gets rid of the bit-stuffing above.

        cmc
        sbbq    q, q
        andq    q, n0
        andq    q, n1
        andq    q, n2
        movq    $0xae79787c40d06994, n3
        andq    q, n3
        andl    $5, qshort

        subq    n0, (z)
        sbbq    n1, 8(z)
        sbbq    n2, 16(z)
        sbbq    n3, 24(z)
        sbbq    q, 32(z)
        sbbq    a, 40(z)
        sbbq    a, 48(z)
        sbbq    a, 56(z)
        sbbl    ashort, cshort
        andl    $0x1FF, cshort
        movq    c, 64(z)

#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
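
// ----------------------------------------------------------------------------
// Informational comment only: a minimal C sketch of the reduction strategy
// used above, useful for cross-checking the assembly. The helper name
// bignum_mod_n521_9_ref is hypothetical (not part of s2n-bignum), and
// unsigned __int128 is a GCC/Clang extension. It mirrors the three steps:
// quotient estimate q = h + 1, bit-stuffed accumulation of q * r_521 + t,
// then a masked subtraction of r_521 when r went negative.
//
//      #include <stdint.h>
//
//      // Words of r_521 = 2^521 - n_521, little-endian
//      static const uint64_t r521[9] =
//        { 0x449048e16ec79bf7, 0xc44a36477663b851, 0x8033feb708f65a2f,
//          0xae79787c40d06994, 5, 0, 0, 0, 0 };
//
//      void bignum_mod_n521_9_ref(uint64_t z[9], const uint64_t x[9])
//      {
//          // Quotient estimate q = h + 1 where x = 2^521 * h + t
//          uint64_t q = (x[8] >> 9) + 1, r[9], brw = 0;
//          unsigned __int128 acc = 0;
//          for (int i = 0; i < 9; i++)
//          {   // OR-ing 1s into bits 521..575 adds the 2^576 - 2^521 bias
//              uint64_t xi = (i == 8) ? (x[8] | ~(uint64_t)0x1FF) : x[i];
//              acc += (unsigned __int128)q * r521[i] + xi;
//              r[i] = (uint64_t)acc;
//              acc >>= 64;
//          }
//          // Carry out of the top word <=> r >= 0; otherwise add back
//          // n_521 as a masked subtraction of r_521 (the final 9-bit mask
//          // supplies the 2^521 and strips the stuffed bits)
//          uint64_t mask = acc ? 0 : ~(uint64_t)0;
//          for (int i = 0; i < 9; i++)
//          {
//              unsigned __int128 d =
//                  (unsigned __int128)r[i] - (r521[i] & mask) - brw;
//              z[i] = (uint64_t)d;
//              brw = (uint64_t)(d >> 64) & 1;
//          }
//          z[8] &= 0x1FF;
//      }
//
// One way to sanity-check the assembly is to compare its output against
// this model on random 9-word inputs.
// ----------------------------------------------------------------------------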