// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Convert to Montgomery form z := (2^384 * x) mod p_384
// Input x[6]; output z[6]
//
//    extern void bignum_tomont_p384
//     (uint64_t z[static 6], uint64_t x[static 6]);
//
// Standard ARM ABI: X0 = z, X1 = x
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p384)
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p384_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p384_alt)
        .text
        .balign 4

// ----------------------------------------------------------------------------
// Core "x |-> (2^64 * x) mod p_384" macro, with x assumed to be < p_384.
// Input is in [d6;d5;d4;d3;d2;d1] and output in [d5;d4;d3;d2;d1;d0]
// using d6 as well as t1, t2, t3 as temporaries.
//
// Outline, writing h for the top input word (the incoming d6):
//  1. Form the quotient estimate q = min(h + 1, 2^64 - 1).
//  2. Build q * (2^384 - p_384) = q * (2^128 + 2^96 - 2^32 + 1) in four words.
//  3. Add that to 2^64 * x and fold in h - q at the top, which yields
//     2^64 * x - q * p_384 as a 7-word value with its top word left in t3.
//  4. Use t3 as a bitmask to add p_384 back when the estimate overshot.
//
// The sequence is straight-line (no branches, no memory accesses) and it
// overwrites the condition flags. Every macro argument must be a distinct
// register, since all ten are written or read independently below.
// ----------------------------------------------------------------------------

#define modstep_p384(d6,d5,d4,d3,d2,d1,d0, t1,t2,t3)                        \
/* Initial quotient approximation q = min (h + 1) (2^64 - 1) */             \
/* Here h is the top input word. The adds sets carry only when h + 1 */     \
/* wraps to 0, and adding the resulting all-1s mask then gives 2^64 - 1 */  \
        adds    d6, d6, #1;                                         \
        csetm   t3, cs;                                             \
        add     d6, d6, t3;                                         \
/* t3 = h - q as a single word: 0 if q was clamped, else all 1s (-1) */     \
        orn     t3, xzr, t3;                                        \
/* [t2;t1] = 2^64 * q - q, held as the word pair (q - 1, -q mod 2^64) */    \
        sub     t2, d6, #1;                                         \
        sub     t1, xzr, d6;                                        \
/* Correction term [d6;t2;t1;d0] = q * (2^384 - p_384) */                   \
/* Shift [t2;t1] left 32 bits into [t2;t1;d0] to get q * (2^96 - 2^32) */   \
        lsl     d0, t1, #32;                                        \
        extr    t1, t2, t1, #32;                                    \
        lsr     t2, t2, #32;                                        \
/* Then add q at word 0 and at word 2 for the + 1 and + 2^128 terms */      \
        adds    d0, d0, d6;                                         \
        adcs    t1, t1, xzr;                                        \
        adcs    t2, t2, d6;                                         \
        adc     d6, xzr, xzr;                                       \
/* Addition to the initial value */                                         \
/* The input counts as shifted up by one word, so correction word 0 (d0) */ \
/* is already the low output word and the final carry lands in t3 */        \
        adds    d1, d1, t1;                                         \
        adcs    d2, d2, t2;                                         \
        adcs    d3, d3, d6;                                         \
        adcs    d4, d4, xzr;                                        \
        adcs    d5, d5, xzr;                                        \
        adc     t3, t3, xzr;                                        \
/* Use net top of the 7-word answer in t3 for masked correction */          \
/* That is, add (p_384 AND t3) word by word; t3 is meant to be 0 or all */  \
/* 1s here. Word 0 of p_384 is 0x00000000ffffffff */                        \
        mov     t1, #0x00000000ffffffff;                            \
        and     t1, t1, t3;                                         \
        adds    d0, d0, t1;                                         \
/* XOR with t3 turns the masked word 0 into 0xffffffff00000000 AND t3, */   \
/* which is the masked word 1 of p_384, without loading a new constant */   \
        eor     t1, t1, t3;                                         \
        adcs    d1, d1, t1;                                         \
/* Word 2 of p_384 is 0xfffffffffffffffe */                                 \
        mov     t1, #0xfffffffffffffffe;                            \
        and     t1, t1, t3;                                         \
        adcs    d2, d2, t1;                                         \
/* Words 3 to 5 of p_384 are all 1s, so the mask t3 is added directly */    \
        adcs    d3, d3, t3;                                         \
        adcs    d4, d4, t3;                                         \
        adc     d5, d5, t3

// Entry point. Both symbol names below label the same address, so
// bignum_tomont_p384 and bignum_tomont_p384_alt share this one body.
// On entry x0 = z (output pointer) and x1 = x (input pointer). The code
// uses only registers x0 to x12 plus the condition flags, and does not
// touch the stack.

S2N_BN_SYMBOL(bignum_tomont_p384):

S2N_BN_SYMBOL(bignum_tomont_p384_alt):

// Working digits. d0..d5 receive the six input words (d0 least
// significant); d6 is the spare seventh register needed by each
// modstep_p384 invocation.

#define d0 x2
#define d1 x3
#define d2 x4
#define d3 x5
#define d4 x6
#define d5 x7
#define d6 x8

// Scratch registers handed to modstep_p384.

#define t1 x9
#define t2 x10
#define t3 x11

// Holders for the trial difference x - p_384 in the initial reduction.
// n0..n3 are the same registers as d6, t1, t2, t3, which carry nothing
// live at that stage. n5 is x1, the input pointer, which is safe to
// overwrite because all six input words have been loaded by then.

#define n0 x8
#define n1 x9
#define n2 x10
#define n3 x11
#define n4 x12
#define n5 x1

// Load the inputs

        ldp     d0, d1, [x1]
        ldp     d2, d3, [x1, #16]
        ldp     d4, d5, [x1, #32]

// Do an initial reduction to make sure this is < p_384, using just
// a copy of the bignum_mod_p384_6 code. This is needed to set up the
// invariant "input < p_384" for the main modular reduction steps.
//
// [n5;...;n0] := x - p_384. The three low words of p_384 are subtracted
// explicitly. The three high words are all 1s, and subtracting an all-1s
// word with borrow gives the same result and carry as adding zero with
// carry, hence the adcs with xzr. Any 6-word input is below 2^384, which
// is below 2 * p_384, so one conditional subtraction is enough.

        mov     n0, #0x00000000ffffffff
        mov     n1, #0xffffffff00000000
        mov     n2, #0xfffffffffffffffe
        subs    n0, d0, n0
        sbcs    n1, d1, n1
        sbcs    n2, d2, n2
        adcs    n3, d3, xzr
        adcs    n4, d4, xzr
        adcs    n5, d5, xzr

// Carry clear (cc) means the subtraction borrowed, i.e. x < p_384, so the
// original digits are kept; otherwise the difference is selected. Using
// csel rather than a branch keeps the instruction sequence the same for
// both outcomes.

        csel    d0, d0, n0, cc
        csel    d1, d1, n1, cc
        csel    d2, d2, n2, cc
        csel    d3, d3, n3, cc
        csel    d4, d4, n4, cc
        csel    d5, d5, n5, cc

// Successively multiply by 2^64 and reduce
//
// Six steps give an overall factor of 2^384. The register arguments rotate
// by one place per step: the register holding the top word going in is
// consumed as a temporary, and the currently spare register (last digit
// argument) receives the new least significant word. The value starts in
// [d5;d4;d3;d2;d1;d0] and ends in [d6;d5;d4;d3;d2;d1].

        modstep_p384(d5,d4,d3,d2,d1,d0,d6, t1,t2,t3)
        modstep_p384(d4,d3,d2,d1,d0,d6,d5, t1,t2,t3)
        modstep_p384(d3,d2,d1,d0,d6,d5,d4, t1,t2,t3)
        modstep_p384(d2,d1,d0,d6,d5,d4,d3, t1,t2,t3)
        modstep_p384(d1,d0,d6,d5,d4,d3,d2, t1,t2,t3)
        modstep_p384(d0,d6,d5,d4,d3,d2,d1, t1,t2,t3)

// Store the result and return
//
// After the rotation above d1 is the least significant word and d6 the
// most significant, so they are written to z[0] through z[5] in that order.

        stp     d1, d2, [x0]
        stp     d3, d4, [x0, #16]
        stp     d5, d6, [x0, #32]

        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif