// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Reduce modulo group order, z := x mod n_521
// Input x[9]; output z[9]
//
//    extern void bignum_mod_n521_9
//      (uint64_t z[static 9], uint64_t x[static 9]);
//
// Reduction is modulo the group order of the NIST curve P-521.
//
// Standard ARM ABI: X0 = z, X1 = x
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

        // Symbol visibility/privacy directives for both entry-point names.
        // These macros are not defined in this file; presumably they come
        // from _internal_s2n_bignum.h and expand to platform-specific
        // directives (verify against that header).
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n521_9)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n521_9)
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n521_9_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n521_9_alt)
        // Place the code in the text section, aligned to a 4-byte boundary
        // (the size of one A64 instruction).
        .text
        .balign 4

// Register aliases (C preprocessor names for X registers).
// Only X0-X15 are used.

// Pointer arguments: z = output buffer, x = input buffer

#define z x0
#define x x1

// Low four 64-bit digits of the constant r_521 = 2^521 - n_521.
// They are kept live after the multiplication so they can be masked and
// subtracted in the final correction step.

#define n0 x2
#define n1 x3
#define n2 x4
#define n3 x5

// Nine-digit accumulator, d0 least significant. It first holds q * r_521
// in d0..d4, then the running remainder, and is finally stored to z.

#define d0 x6
#define d1 x7
#define d2 x8
#define d3 x9
#define d4 x10
#define d5 x11
#define d6 x12
#define d7 x13
#define d8 x14

// Quotient estimate; later overwritten with the correction bitmask.

#define q x15

// Re-use d6 and d7 as temporaries before they are needed

#define s d6
#define t d7

// movbig(nn, n3, n2, n1, n0): load a 64-bit constant into register nn from
// four 16-bit immediates, n3 being the most significant piece and n0 the
// least. movz writes n0 into bits 0..15 and zeroes the rest; the three movk
// instructions then insert n1, n2 and n3 at bits 16, 32 and 48.
//
// Inside the macro body the parameter names n0..n3 shadow the register
// aliases of the same names, so they refer to the immediates passed in.
//
// The four instructions are emitted on a single line joined by ";", which
// GNU as for AArch64 treats as a statement separator.
// NOTE(review): some assemblers treat ";" as a comment character instead;
// confirm that every supported toolchain accepts this separator or that the
// build splits the line beforehand.

#define movbig(nn,n3,n2,n1,n0)                                              \
        movz    nn, n0;                                             \
        movk    nn, n1, lsl #16;                                    \
        movk    nn, n2, lsl #32;                                    \
        movk    nn, n3, lsl #48

// ----------------------------------------------------------------------------
// Entry points. Both labels sit on the same first instruction, so
// bignum_mod_n521_9 and bignum_mod_n521_9_alt run identical code.
//
// Inputs:   x (X1) = address of 9 64-bit digits, least significant first
// Outputs:  z (X0) = address of 9 digits that receive x mod n_521
// Scratch:  X2-X15 and the condition flags; the stack is not used.
//
// Method: writing x = 2^521 * h + t with t < 2^521, take q = h + 1 as a
// quotient estimate and compute r = x - q * n_521 via n_521 = 2^521 - r_521.
// If r came out negative, add n_521 back once. The code is straight-line:
// the correction is applied through a mask, not a branch.
//
// Every load from x is issued before the first store to z.
// ----------------------------------------------------------------------------

S2N_BN_SYMBOL(bignum_mod_n521_9):

S2N_BN_SYMBOL(bignum_mod_n521_9_alt):

// Load the top digit x[8] first into d8. Bit 521 of x is bit 9 of this
// digit, so shifting right by 9 gives h where x = 2^521 * h + t.
// The initial quotient estimate is q = h + 1.

        ldr     d8, [x, #64]
        lsr     q, d8, #9
        add     q, q, #1

// Let [5; n3; n2; n1; n0] = r_521 = 2^521 - n_521
// and form [d4;d3;d2;d1;d0] = q * r_521.
// Each mul produces the low 64 bits of one digit product. The top digit 5
// is handled without a multiply as 5 * q = (q << 2) + q; since q <= 2^55
// this cannot overflow 64 bits.

        movbig( n0, #0x4490, #0x48e1, #0x6ec7, #0x9bf7)
        mul     d0, n0, q
        movbig( n1, #0xc44a, #0x3647, #0x7663, #0xb851)
        mul     d1, n1, q
        movbig( n2, #0x8033, #0xfeb7, #0x08f6, #0x5a2f)
        mul     d2, n2, q
        movbig( n3, #0xae79, #0x787c, #0x40d0, #0x6994)
        mul     d3, n3, q
        lsl     d4, q, #2
        add     d4, d4, q

// Fold the high half of each digit product (umulh) into the next digit up,
// using t as a temporary. The last step is a plain adc because the product
// fits in five digits, so no carry out of d4 is needed.

        umulh   t, n0, q
        adds    d1, d1, t
        umulh   t, n1, q
        adcs    d2, d2, t
        umulh   t, n2, q
        adcs    d3, d3, t
        umulh   t, n3, q
        adc     d4, d4, t

// Now load other digits and form r = x - q * n_521 = (q * r_521 + t) - 2^521.
// But the computed result stuffs in 1s from bit 521 onwards and actually
// gives r' = (q * r_521 + t) + (2^576 - 2^521) = r + 2^576, including the
// top carry. Hence CF <=> r >= 0, while r' == r (mod 2^521).
//
// s and t (aliases of d6 and d7) carry the incoming digits of x until d6
// and d7 themselves are loaded. The orr sets bits 9..63 of the top digit
// and does not write the flags, so the carry chain runs unbroken through it.

        ldp     s, t, [x]
        adds    d0, d0, s
        adcs    d1, d1, t
        ldp     s, t, [x, #16]
        adcs    d2, d2, s
        adcs    d3, d3, t
        ldp     t, d5, [x, #32]
        adcs    d4, d4, t
        adcs    d5, d5, xzr
        ldp     d6, d7, [x, #48]
        adcs    d6, d6, xzr
        adcs    d7, d7, xzr
        orr     d8, d8, #~0x1FF
        adcs    d8, d8, xzr

// We already know r < n_521, but if it actually went negative then
// we need to add back n_521 again. Recycle q as a bitmask for r < 0
// (all 1s when the carry from the addition above is clear, otherwise 0),
// and just subtract the masked r_521 and then mask the top digit to 9 bits
// rather than literally adding 2^521.
// This also gets rid of the bit-stuffing above.
//
// n0 is reused to hold the masked top digit 5 of r_521. Neither the mov
// nor the and instructions write the flags, so the borrow chain from subs
// through the final sbc is preserved.

        csetm   q, cc
        and     n0, n0, q
        subs    d0, d0, n0
        and     n1, n1, q
        sbcs    d1, d1, n1
        and     n2, n2, q
        sbcs    d2, d2, n2
        and     n3, n3, q
        sbcs    d3, d3, n3
        mov     n0, #5
        and     n0, n0, q
        sbcs    d4, d4, n0
        sbcs    d5, d5, xzr
        sbcs    d6, d6, xzr
        sbcs    d7, d7, xzr
        sbc     d8, d8, xzr
        and     d8, d8, #0x1FF

// Store the end result: nine digits, least significant first

        stp     d0, d1, [z]
        stp     d2, d3, [z, #16]
        stp     d4, d5, [z, #32]
        stp     d6, d7, [z, #48]
        str     d8, [z, #64]

        ret

// On Linux ELF targets, emit an empty .note.GNU-stack section. This is the
// conventional marker telling the linker that this object does not require
// an executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif