// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Reduce modulo group order, z := x mod n_521
// Input x[9]; output z[9]
//
//    extern void bignum_mod_n521_9_alt
//     (uint64_t z[static 9], uint64_t x[static 9]);
//
// Reduction is modulo the group order of the NIST curve P-521.
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n521_9_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n521_9_alt)
        .text

#define z %rdi
#define x %rsi

#define q %rcx
#define a %rax
#define d %rdx

#define c %rcx

#define n0 %r8
#define n1 %r9
#define n2 %r10
#define n3 %r11

#define ashort %eax
#define cshort %ecx
#define qshort %edx

S2N_BN_SYMBOL(bignum_mod_n521_9_alt):

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
#endif

// Load the top digit, putting a bit-stuffed version in the output buffer.
// The initial quotient estimate is q = h + 1 where x = 2^521 * h + t.
// The last add also clears the CF and OF flags ready for the carry chain.

        movq    64(x), q
        movq    $~0x1FF, a
        orq     q, a
        movq    a, 64(z)
        shrq    $9, q
        addq    $1, q

// Now load the other digits and form r = x - q * n_521 = (q * r_521 + t) - 2^521,
// which is stored in the output buffer. Thanks to the bit-stuffing at the
// start, we get r' = (q * r_521 + t) + (2^576 - 2^521) = r + 2^576 as the
// computed result including the top carry. Hence CF <=> r >= 0, while
// r' == r (mod 2^521) because things below bit 521 are uncorrupted. We
// keep the top word in the register c since we at least have that one free.

        movq    $0x449048e16ec79bf7, %rax
        mulq    q
        movq    %rax, n0
        movq    %rdx, n1

        movq    $0xc44a36477663b851, %rax
        mulq    q
        xorq    n2, n2
        addq    %rax, n1
        adcq    %rdx, n2

        movq    $0x8033feb708f65a2f, %rax
        mulq    q
        xorq    n3, n3
        addq    %rax, n2
        adcq    %rdx, n3

        movq    $0xae79787c40d06994, %rax
        mulq    q
        imulq   $5, q                   // q * 5 = product with the top limb of r_521
        addq    %rax, n3
        adcq    %rdx, q
        sbbq    %rdx, %rdx
        negq    %rdx                    // %rdx = carry, giving the sixth product word

// [%rdx;q;n3;n2;n1;n0] = q * r_521

        xorl    %eax, %eax              // %rax is used as a zero hereafter
        addq    (x), n0
        movq    n0, (z)
        adcq    8(x), n1
        movq    n1, 8(z)
        adcq    16(x), n2
        movq    n2, 16(z)
        adcq    24(x), n3
        movq    n3, 24(z)
        adcq    32(x), q
        movq    q, 32(z)
        adcq    40(x), %rdx
        movq    %rdx, 40(z)
        movq    48(x), d
        adcq    %rax, d
        movq    d, 48(z)
        movq    56(x), d
        adcq    %rax, d
        movq    d, 56(z)
        movq    64(z), c
        adcq    %rax, c

// We already know r < n_521, but if it actually went negative then
// we need to add back n_521 again. Use d as a bitmask for r < 0, and
// just subtract the masked r_521 and mask the top word, rather than
// literally adding 2^521. This also gets rid of the bit-stuffing above.

        cmc                             // Now CF <=> r < 0
        sbbq    d, d                    // d = all-ones bitmask for r < 0
        movq    $0x449048e16ec79bf7, n0
        andq    d, n0
        movq    $0xc44a36477663b851, n1
        andq    d, n1
        movq    $0x8033feb708f65a2f, n2
        andq    d, n2
        movq    $0xae79787c40d06994, n3
        andq    d, n3
        andq    $5, d                   // d = masked top limb of r_521
        subq    n0, (z)
        sbbq    n1, 8(z)
        sbbq    n2, 16(z)
        sbbq    n3, 24(z)
        sbbq    d, 32(z)
        sbbq    %rax, 40(z)
        sbbq    %rax, 48(z)
        sbbq    %rax, 56(z)
        sbbl    ashort, cshort
        andl    $0x1FF, cshort          // Mask to 521 bits, removing the stuffing
        movq    c, 64(z)

#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
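
// ----------------------------------------------------------------------------
// For reference only, not assembled: a hedged C sketch of the same reduction,
// kept as commentary. The function name bignum_mod_n521_9_model is
// hypothetical; the constants are the standard P-521 group order n_521 and
// r_521 = 2^521 - n_521; and unsigned __int128 (a GCC/Clang extension) stands
// in for the explicit carry chains above. Unlike the assembly, this sketch
// makes no claim to constant-time behaviour.
//
//   #include <stdint.h>
//
//   // n_521, the group order of P-521, as nine little-endian 64-bit words.
//   static const uint64_t N521[9] = {
//       0xbb6fb71e91386409ULL, 0x3bb5c9b8899c47aeULL, 0x7fcc0148f709a5d0ULL,
//       0x51868783bf2f966bULL, 0xfffffffffffffffaULL, 0xffffffffffffffffULL,
//       0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x00000000000001ffULL
//   };
//
//   // r_521 = 2^521 - n_521, which fits in five words (top word is just 5).
//   static const uint64_t R521[5] = {
//       0x449048e16ec79bf7ULL, 0xc44a36477663b851ULL,
//       0x8033feb708f65a2fULL, 0xae79787c40d06994ULL, 5ULL
//   };
//
//   void bignum_mod_n521_9_model(uint64_t z[9], const uint64_t x[9])
//   {
//       // Split x = 2^521 * h + t and take the quotient estimate q = h + 1.
//       uint64_t q = (x[8] >> 9) + 1;
//
//       // Compute s = q * r_521 + t; then r = x - q * n_521 = s - 2^521,
//       // so bit 521 of s is set exactly when r >= 0.
//       uint64_t s[9];
//       unsigned __int128 acc = 0;
//       for (int i = 0; i < 9; i++) {
//           acc += (i == 8) ? (x[8] & 0x1ffULL) : x[i];
//           if (i < 5) acc += (unsigned __int128)q * R521[i];
//           s[i] = (uint64_t)acc;
//           acc >>= 64;
//       }
//
//       // If r went negative, add back n_521; either way, reducing the top
//       // word to 9 bits performs the final subtraction of 2^521.
//       uint64_t mask = (s[8] >> 9) ? 0 : ~0ULL;  // all-ones iff r < 0
//       unsigned __int128 c = 0;
//       for (int i = 0; i < 9; i++) {
//           c += (unsigned __int128)s[i] + (N521[i] & mask);
//           z[i] = (uint64_t)c;
//           c >>= 64;
//       }
//       z[8] &= 0x1ffULL;
//   }
// ----------------------------------------------------------------------------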