// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming
// x is already reduced modulo p_384 (that is, x < p_384)
// Inputs c, x[6]; output z[6]
//
//    extern void bignum_cmul_p384
//     (uint64_t z[static 6], uint64_t c, uint64_t x[static 6]);
//
// Standard ARM ABI: X0 = z, X1 = c, X2 = x
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum.h"

        // Symbol setup for the two entry-point names used below. The
        // S2N_BN_SYM_* macros are not defined in this file (presumably they
        // come from _internal_s2n_bignum.h and emit per-platform visibility
        // directives -- confirm there). The code is then placed in the text
        // section on a 4-byte boundary.
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p384)
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p384_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p384_alt)
        .text
        .balign 4

// Arguments as they arrive in registers: output pointer, single-word
// multiplier, input pointer.

#define z x0
#define c x1
#define x x2

// d0..d5 first hold the low halves of the six word products and then the
// running 6-word result. Note that d0 is the same register as the pointer x,
// so all loads through x must be finished before d0 is first written.

#define d0 x2
#define d1 x3
#define d2 x4
#define d3 x5
#define d4 x6
#define d5 x7

// a0..a5 first hold the input words, then the high halves of the word
// products, and a0..a3 are later reused for the multiple of r that is added
// during reduction.

#define a0 x8
#define a1 x9
#define a2 x10
#define a3 x11
#define a4 x12
#define a5 x13

// The names below share registers with names above. Each is only written
// once the earlier role of its register is finished:
//
//   h  (top word of the product) shares x1 with c; it is written by the
//      last instruction that reads c.
//   h1 (= h + 1) shares x12 with a4, and hn (= ~h) shares x13 with a5.
//   m  (correction mask) shares x8 with a0, and l (scratch for the words
//      of p_384) shares x9 with a1.

#define h x1
#define h1 x12
#define hn x13
#define m x8
#define l x9


// Entry points. Both symbols label the same first instruction, so the two
// names run exactly the same code.
//
// On entry x0 = z (6-word output), x1 = c (single word), x2 = x (6-word
// input). The code is straight-line: it writes registers x1-x13 and the
// flags, stores 6 words at z, and makes no use of the stack.

S2N_BN_SYMBOL(bignum_cmul_p384):

S2N_BN_SYMBOL(bignum_cmul_p384_alt):

// First do the multiply, straightforwardly, getting [h; d5; ...; d0]
//
// All six input words are loaded before any product is formed, because d0
// occupies the same register as the pointer x.

        ldp     a0, a1, [x]
        ldp     a2, a3, [x, #16]
        ldp     a4, a5, [x, #32]

// Low 64 bits of each word product c * x[i]

        mul     d0, c, a0
        mul     d1, c, a1
        mul     d2, c, a2
        mul     d3, c, a3
        mul     d4, c, a4
        mul     d5, c, a5

// High 64 bits of each word product, overwriting the input words. The last
// one lands in h, which is the same register as c, so it must remain the
// final instruction that reads c.

        umulh   a0, c, a0
        umulh   a1, c, a1
        umulh   a2, c, a2
        umulh   a3, c, a3
        umulh   a4, c, a4
        umulh   h, c, a5

// Add each high half into the next-higher low half in one carry chain; the
// final carry is absorbed into the top word h.

        adds    d1, d1, a0
        adcs    d2, d2, a1
        adcs    d3, d3, a2
        adcs    d4, d4, a3
        adcs    d5, d5, a4
        adc     h, h, xzr

// Write the 7-word product as 2^384 * h + l, where h is its top word and l
// is its low 6 words [d5; ...; d0]. (This l is the mathematical quantity,
// not the scratch register named l that is used further down.)
//
// By the range hypothesis on the input, we know h1 = h + 1 does not wrap.
// Moreover
//
//       -p_384 <= (2^384 * h + l) - h1 * p_384 < p_384,
//
// so we just need to subtract h1 * p_384 and then correct, if that is
// negative, by adding p_384 back.
//
// Write p_384 = 2^384 - r where r = 2^128 + 2^96 - 2^32 + 1
//
// We want (2^384 * h + l) - (h + 1) * (2^384 - r)
//       = (l + (h + 1) * r) - 2^384.
//
// Thus we can do the computation in 6 words of l + (h + 1) * r, and if it
// does *not* carry we need to add p_384. We can rewrite this as the following,
// using ~h = 2^64 - (h + 1) and absorbing the 2^64 in the higher term
// using h instead of h + 1.
//
//         l + (h + 1) * r
//       = l + 2^128 * (h + 1) + 2^96 * (h + 1) - 2^32 * (h + 1) + (h + 1)
//       = l + 2^128 * (h + 1) + 2^96 * h + 2^32 * ~h + (h + 1)

// Build the shifted pieces. h1 and hn reuse the registers of a4 and a5,
// which are no longer needed at this point.
//
//   a0 = low word of 2^32 * ~h
//   a1 = (h << 32) | (~h >> 32), i.e. the high part of 2^32 * ~h combined
//        with the low part of 2^96 * h
//   a2 = high part of 2^96 * h

        add     h1, h, #1
        orn     hn, xzr, h
        lsl     a0, hn, #32
        extr    a1, h, hn, #32
        lsr     a2, h, #32

// Fold in the two remaining terms: (h + 1) at word 0 and 2^128 * (h + 1) at
// word 2, carrying through word 1 and collecting the carry-out in a3. Now
// [a3; a2; a1; a0] = (h + 1) * r.

        adds    a0, a0, h1
        adcs    a1, a1, xzr
        adcs    a2, a2, h1
        adc     a3, xzr, xzr

// Add (h + 1) * r to the low 6 words. The last adcs leaves the carry out of
// the 6-word sum in the carry flag for the test below.

        adds    d0, d0, a0
        adcs    d1, d1, a1
        adcs    d2, d2, a2
        adcs    d3, d3, a3
        adcs    d4, d4, xzr
        adcs    d5, d5, xzr

// Catch the carry and do a masked addition of p_384
//
// m is all ones when there was no carry (the case needing correction) and
// zero otherwise, so the correction is done by masking, not branching. The
// words of p_384 from the bottom up are 0x00000000ffffffff,
// 0xffffffff00000000, 0xfffffffffffffffe and then three all-ones words.
// The second is obtained from the first by XOR with m, and the all-ones
// words are m itself. The carry out of the final adc is discarded.

        csetm   m, cc

        mov     l, #0x00000000ffffffff
        and     l, l, m
        adds    d0, d0, l
        eor     l, l, m
        adcs    d1, d1, l
        mov     l, #0xfffffffffffffffe
        and     l, l, m
        adcs    d2, d2, l
        adcs    d3, d3, m
        adcs    d4, d4, m
        adc     d5, d5, m

// Store the result

        stp     d0, d1, [z]
        stp     d2, d3, [z, #16]
        stp     d4, d5, [z, #32]

        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif