// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Reduce modulo group order, z := x mod n_384
// Input x[k]; output z[6]
//
//    extern void bignum_mod_n384
//     (uint64_t z[static 6], uint64_t k, uint64_t *x);
//
// Reduction is modulo the group order of the NIST curve P-384.
//
// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x
// Microsoft x64 ABI:   RCX = z, RDX = k, R8 = x
//
// Syntax: AT&T / GAS, preprocessed with the C preprocessor.
//
// The main loop uses MULX, ADCX and ADOX, so the k >= 7 path needs a CPU
// with the BMI2 and ADX extensions.
//
// Register usage (after the optional Windows argument shuffle):
//   z            %rdi        output pointer
//   k            %rsi        number of input words still to be absorbed
//   x            %rcx        input pointer, once moved out of %rdx
//   m5..m0       %r13..%r8   6-word running remainder, m5 most significant
//   d            %r14        carry mask, then the incoming low digit
//   n0, n1, n2   %rax, %rbx, %rdx   scratch for the words of 2^384 - n_384
//   q            %rdx        quotient estimate (shares %rdx with n2; %rdx is
//                            also the implicit multiplicand of MULX)
//
// The callee-saved registers %rbx, %r12, %r13, %r14 (plus %rdi, %rsi under
// the Microsoft ABI) are saved and restored; the other registers named
// above, and the flags, are overwritten.
//
// The immediates 0x1313e695333ad68d, 0xa7e5f24db74f5885, 0x389cb27e0bc8d220
// are, from least to most significant, the three nonzero words of
// 2^384 - n_384; its upper three words are zero.
//
// Branch targets carry the function name as a prefix so that they cannot
// collide with labels of other routines (or with the "loop" mnemonic).
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n384)
        .text

#define z %rdi
#define k %rsi
#define x %rcx

#define m0 %r8
#define m1 %r9
#define m2 %r10
#define m3 %r11
#define m4 %r12
#define m5 %r13
#define d %r14

#define n0 %rax
#define n1 %rbx
#define n2 %rdx
#define q %rdx

#define n0short %eax
#define qshort %edx

S2N_BN_SYMBOL(bignum_mod_n384):

// Under the Microsoft ABI, preserve %rdi/%rsi (callee-saved there) and move
// the arguments into the registers the standard ABI would have used.

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// Save extra registers

        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14

// If the input is already <= 5 words long, go to a trivial "copy" path
// (the comparison is unsigned: jc is taken when k < 6)

        cmpq    $6, k
        jc      bignum_mod_n384_shortinput

// Otherwise load the top 6 digits (top-down) and reduce k by 6.
// At this point the input pointer is still in %rdx.

        subq    $6, k
        movq    40(%rdx,k,8), m5
        movq    32(%rdx,k,8), m4
        movq    24(%rdx,k,8), m3
        movq    16(%rdx,k,8), m2
        movq    8(%rdx,k,8), m1
        movq    (%rdx,k,8), m0

// Move x into another register to leave %rdx free for multiplies and use of n2

        movq    %rdx, x

// Reduce the top 6 digits mod n_384 (a conditional subtraction of n_384).
// Adding 2^384 - n_384 produces a carry out exactly when m >= n_384, and
// in that case the 6-word sum is already m - n_384.

        movq    $0x1313e695333ad68d, n0
        movq    $0xa7e5f24db74f5885, n1
        movq    $0x389cb27e0bc8d220, n2

        addq    n0, m0
        adcq    n1, m1
        adcq    n2, m2
        adcq    $0, m3
        adcq    $0, m4
        adcq    $0, m5

// d = all 1s if there was no carry (m < n_384), else 0. Mask the constant
// with it and subtract, which undoes the addition only when m < n_384.

        sbbq    d, d
        notq    d
        andq    d, n0
        andq    d, n1
        andq    d, n2
        subq    n0, m0
        sbbq    n1, m1
        sbbq    n2, m2
        sbbq    $0, m3
        sbbq    $0, m4
        sbbq    $0, m5

// Now do (k-6) iterations of 7->6 word modular reduction

        testq   k, k
        jz      bignum_mod_n384_writeback

bignum_mod_n384_loop:

// Compute q = min (m5 + 1) (2^64 - 1): form m5 + 1, and if that wrapped
// (carry set) OR in an all-1s mask to saturate.

        movl    $1, qshort
        addq    m5, q
        sbbq    d, d
        orq     d, q

// Load the next digit so current m to reduce = [m5;m4;m3;m2;m1;m0;d]

        movq    -8(x,k,8), d

// Now form [m5;m4;m3;m2;m1;m0;d] = m - q * n_384
// This is done as m - q * 2^384 + q * (2^384 - n_384): subtract q from the
// top word, then accumulate q times the three constant words using two
// interleaved carry chains (ADCX on CF for low halves, ADOX on OF for high
// halves). MULX takes q implicitly from %rdx and leaves the flags alone.

        subq    q, m5
        xorq    n0, n0                  // clears CF and OF to start both chains
        movq    $0x1313e695333ad68d, n0
        mulxq   n0, n0, n1
        adcxq   n0, d
        adoxq   n1, m0
        movq    $0xa7e5f24db74f5885, n0
        mulxq   n0, n0, n1
        adcxq   n0, m0
        adoxq   n1, m1
        movq    $0x389cb27e0bc8d220, n0
        mulxq   n0, n0, n1
        adcxq   n0, m1
        movl    $0, n0short             // zero n0 via mov so CF/OF are preserved
        adoxq   n0, n1                  // fold the pending OF into the top product word
        adcxq   n1, m2
        adcq    $0, m3                  // propagate the remaining CF upwards
        adcq    $0, m4
        adcq    $0, m5

// Now our top word m5 is either zero or all 1s. Use it for a masked
// addition of n_384, which we can do by a *subtraction* of
// 2^384 - n_384 from our portion

        movq    $0x1313e695333ad68d, n0
        andq    m5, n0
        movq    $0xa7e5f24db74f5885, n1
        andq    m5, n1
        movq    $0x389cb27e0bc8d220, n2
        andq    m5, n2

        subq    n0, d
        sbbq    n1, m0
        sbbq    n2, m1
        sbbq    $0, m2
        sbbq    $0, m3
        sbbq    $0, m4

// Now shuffle registers up and loop

        movq    m4, m5
        movq    m3, m4
        movq    m2, m3
        movq    m1, m2
        movq    m0, m1
        movq    d, m0

        decq    k
        jnz     bignum_mod_n384_loop

// Write back

bignum_mod_n384_writeback:

        movq    m0, (z)
        movq    m1, 8(z)
        movq    m2, 16(z)
        movq    m3, 24(z)
        movq    m4, 32(z)
        movq    m5, 40(z)

// Restore registers and return

        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

// Short-input path, reached only when k <= 5: zero all six result words,
// copy the k input words that exist, and store via the common writeback.
// The input pointer is read through %rdx because the move into x has not
// happened on this path.

bignum_mod_n384_shortinput:

        xorq    m0, m0
        xorq    m1, m1
        xorq    m2, m2
        xorq    m3, m3
        xorq    m4, m4
        xorq    m5, m5

        testq   k, k
        jz      bignum_mod_n384_writeback
        movq    (%rdx), m0
        decq    k
        jz      bignum_mod_n384_writeback
        movq    8(%rdx), m1
        decq    k
        jz      bignum_mod_n384_writeback
        movq    16(%rdx), m2
        decq    k
        jz      bignum_mod_n384_writeback
        movq    24(%rdx), m3
        decq    k
        jz      bignum_mod_n384_writeback
        movq    32(%rdx), m4
        jmp     bignum_mod_n384_writeback

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif