// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384
// Input x[6]; output z[6]
//
//    extern void bignum_deamont_p384
//     (uint64_t z[static 6], uint64_t x[static 6]);
//
// Convert a 6-digit bignum x out of its (optionally almost) Montgomery form,
// "almost" meaning any 6-digit input will work, with no range restriction.
//
// Standard ARM ABI: X0 = z, X1 = x
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

        // Declare both entry-point names. The two labels further down are
        // adjacent with no instructions between them, so both symbols name
        // the same code. The exact directives these macros emit are defined
        // in _internal_s2n_bignum.h (not visible here).
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p384)
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p384_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p384_alt)
        // Code goes in the text section, aligned to the 4-byte AArch64
        // instruction size.
        .text
        .balign 4

// ---------------------------------------------------------------------------
// Core one-step "short" Montgomery reduction macro. Takes input in
// [d5;d4;d3;d2;d1;d0] and returns result in [d6;d5;d4;d3;d2;d1],
// adding to the existing contents of [d5;d4;d3;d2;d1]. It is fine
// for d6 to be the same register as d0.
//
// We want to add (2^384 - 2^128 - 2^96 + 2^32 - 1) * w
// where w = [d0 + (d0<<32)] mod 2^64
//
// Why this w: the lowest 64-bit word of the modulus is 2^32 - 1, and
// w * (2^32 - 1) = d0 * (2^64 - 1) == -d0 (mod 2^64). Adding w times the
// modulus therefore zeroes the bottom digit, which is simply dropped; that
// is the division by 2^64 for this step.
//
// How it is computed: rather than multiplying, the addition is split into
// "subtract (2^128 + 2^96 - 2^32 + 1) * w" and "add 2^384 * w". After the
// bottom digit is dropped the amount to subtract from [d3;d2;d1] is the
// three-word value [t3;t2;t1] built below, with the borrow rippling on
// through d4 and d5; the 2^384 * w term becomes the new top digit d6.
//
// Parameters: d0..d6 here are the macro formal parameters (any 64-bit
// general registers), not the file-level register aliases of the same
// names; t1, t2, t3 are scratch registers.
//
// Clobbers: d0 (overwritten with w, then with the result if d6 is the
// same register), t1, t2, t3 and the NZCV flags.
//
// The expansion is a single logical line, so instructions are separated
// with ';'. NOTE(review): this relies on the assembler treating ';' as a
// statement separator (as GNU as does for AArch64); confirm for any other
// assembler this file is built with.
// ---------------------------------------------------------------------------

#define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1)                            \
/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64            */  \
/* Recycle d0 (which we know gets implicitly cancelled) to store it     */  \
        lsl     t1, d0, #32;                                        \
        add     d0, t1, d0;                                         \
/* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32)     */  \
/* We need to subtract 2^32 * this, and we can ignore its lower 32      */  \
/* bits since by design it will cancel anyway; we only need the w_hi    */  \
/* part to get the carry propagation going.                             */  \
        lsr     t1, d0, #32;                                        \
        subs    t1, t1, d0;                                         \
        sbc     t2, d0, xzr;                                        \
/* t1 := bits 32..95 of [t2;t1]: the word to subtract from d1           */  \
        extr    t1, t2, t1, #32;                                    \
/* [t3;t2] := (t2 >> 32) + w: the words to subtract from d2 and d3      */  \
        lsr     t2, t2, #32;                                        \
        adds    t2, t2, d0;                                         \
        adc     t3, xzr, xzr;                                       \
/* Subtract [t3;t2;t1] from [d3;d2;d1], rippling the borrow to d5       */  \
        subs    d1, d1, t1;                                         \
        sbcs    d2, d2, t2;                                         \
        sbcs    d3, d3, t3;                                         \
        sbcs    d4, d4, xzr;                                        \
        sbcs    d5, d5, xzr;                                        \
/* Now effectively add 2^384 * w by taking d0 as the input for last sbc */  \
        sbc     d6, d0, xzr

// Register aliases. These are C preprocessor substitutions applied to whole
// tokens only, so register names such as x1, x10 or xzr are not affected by
// the single-letter aliases x, z, u, v and w.
//
// Every register used (x0-x10) is caller-saved under AAPCS64, so the routine
// does not need to save or restore anything.

// Input parameters: z = destination pointer, x = source pointer
// (first and second argument registers).

#define z x0
#define x x1

// Rotating registers for the intermediate windows: the six 64-bit digits
// of the running value, d0 least significant.

#define d0 x2
#define d1 x3
#define d2 x4
#define d3 x5
#define d4 x6
#define d5 x7

// Other temporaries: scratch for the reduction macro, then the constant
// words and the selection mask for the final correction.

#define u x8
#define v x9
#define w x10

// ---------------------------------------------------------------------------
// Entry point, reachable under both symbol names (the two labels below are
// adjacent, so they denote the same address).
//
// In:    x (x1) = pointer to 6 64-bit digits, least significant first
// Out:   z (x0) = pointer to 6 64-bit digits receiving (x / 2^384) mod p_384
// Uses:  x2-x10 and the NZCV flags; no stack, no calls.
//
// The code is straight-line: there are no conditional branches and every
// memory access is at a fixed offset from x or z. The final correction is
// selected with a mask rather than a branch.
// ---------------------------------------------------------------------------

S2N_BN_SYMBOL(bignum_deamont_p384):

S2N_BN_SYMBOL(bignum_deamont_p384_alt):

// Load the six input digits into the window [d5;d4;d3;d2;d1;d0]. No range
// check is made on the input. All loads happen before any store below, so
// the result is the same if z and x refer to the same buffer.

        ldp     d0, d1, [x]
        ldp     d2, d3, [x, #16]
        ldp     d4, d5, [x, #32]

// Systematically scroll left doing 1-step reductions. Each step cancels
// and drops the current lowest digit, and reuses that same register for
// the new top digit, so the window rotates by one register per step.
// After six steps (a total division by 2^384) the digits are back in
// [d5;d4;d3;d2;d1;d0] order. The value at that point is
// (x + m * p_384) / 2^384 for some m < 2^384, hence at most p_384.

        montreds(d0,d5,d4,d3,d2,d1,d0, u,v,w)

        montreds(d1,d0,d5,d4,d3,d2,d1, u,v,w)

        montreds(d2,d1,d0,d5,d4,d3,d2, u,v,w)

        montreds(d3,d2,d1,d0,d5,d4,d3, u,v,w)

        montreds(d4,d3,d2,d1,d0,d5,d4, u,v,w)

        montreds(d5,d4,d3,d2,d1,d0,d5, u,v,w)

// Now compare end result in [d5;d4;d3;d2;d1;d0] = dd with p_384 by *adding*
// 2^384 - p_384 = [0;0;0;w;v;u]. This will set CF if
// dd + (2^384 - p_384) >= 2^384, hence iff dd >= p_384

        mov     u, #0xffffffff00000001
        mov     v, #0x00000000ffffffff
        mov     w, #0x0000000000000001

// Flags-only addition chain: every sum is discarded into xzr and only the
// carry out of the top digit is kept.

        adds    xzr, d0, u
        adcs    xzr, d1, v
        adcs    xzr, d2, w
        adcs    xzr, d3, xzr
        adcs    xzr, d4, xzr
        adcs    xzr, d5, xzr

// Convert the condition dd >= p_384 into a bitmask in w and do a masked
// subtraction of p_384, via a masked addition of 2^384 - p_384:
// w becomes all ones if the carry is set and zero otherwise. Masking u and
// v keeps or clears the two low constant words. The third constant word is
// 1, so "w and 1" recreates it (or 0) directly from the mask. The carry
// out of the last addition is deliberately ignored: when the mask is set
// the sum wraps modulo 2^384, which yields dd - p_384.

        csetm   w, cs
        and     u, u, w
        adds    d0, d0, u
        and     v, v, w
        adcs    d1, d1, v
        and     w, w, #1
        adcs    d2, d2, w
        adcs    d3, d3, xzr
        adcs    d4, d4, xzr
        adc     d5, d5, xzr

// Store the six result digits to z, least significant first

        stp     d0, d1, [z]
        stp     d2, d3, [z, #16]
        stp     d4, d5, [z, #32]

        ret

// On Linux ELF targets, emit an empty .note.GNU-stack section, the usual
// marker that this object does not require an executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif