// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
// Inputs x[6], y[6]; output z[6]
//
//    extern void bignum_add_p384
//     (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
//
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p384)
        .text

#define z %rdi
#define x %rsi
#define y %rdx

#define d0 %rax
#define d1 %rcx
#define d2 %r8
#define d3 %r9
#define d4 %r10
#define d5 %r11

// Re-use the input pointers as temporaries once we're done with them

#define a %rsi
#define c %rdx

#define ashort %esi
#define cshort %edx

S2N_BN_SYMBOL(bignum_add_p384):

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// Add the inputs as 2^384 * c + [d5;d4;d3;d2;d1;d0] = x + y
// This could be combined with the next block using ADCX and ADOX.

        movq    (x), d0
        addq    (y), d0
        movq    8(x), d1
        adcq    8(y), d1
        movq    16(x), d2
        adcq    16(y), d2
        movq    24(x), d3
        adcq    24(y), d3
        movq    32(x), d4
        adcq    32(y), d4
        movq    40(x), d5
        adcq    40(y), d5
        movl    $0, cshort
        adcq    c, c

// Now subtract p_384 from 2^384 * c + [d5;d4;d3;d2;d1;d0] to get x + y - p_384.
// This is actually done by *adding* the 7-word negation r_384 = 2^448 - p_384,
// where r_384 = [-1; 0; 0; 0; 1; 0x00000000ffffffff; 0xffffffff00000001]

        movq    $0xffffffff00000001, a
        addq    a, d0
        movl    $0x00000000ffffffff, ashort
        adcq    a, d1
        adcq    $1, d2
        adcq    $0, d3
        adcq    $0, d4
        adcq    $0, d5
        adcq    $-1, c

// Since by hypothesis x < p_384 we know x + y - p_384 < 2^384, so the top
// carry c actually gives us a bitmask for x + y - p_384 < 0, which we
// now use to make r' = mask * (2^384 - p_384) for a compensating subtraction.
// We don't quite have enough ABI-modifiable registers to create all three
// nonzero digits of r' while maintaining d0..d5, but make the first two now.

        andq    a, c            // c = masked 0x00000000ffffffff
        xorq    a, a
        subq    c, a            // a = masked 0xffffffff00000001

// Do the first two digits of the subtraction and writeback

        subq    a, d0
        movq    d0, (z)
        sbbq    c, d1
        movq    d1, 8(z)

// Preserve the carry chain while creating the extra masked digit, since
// the logical operation will clear CF

        sbbq    d0, d0          // d0 = -CF, saving the borrow
        andq    a, c            // c = masked 0x0000000000000001
        negq    d0              // restore CF from d0

// Do the rest of the subtraction and writeback

        sbbq    c, d2
        movq    d2, 16(z)
        sbbq    $0, d3
        movq    d3, 24(z)
        sbbq    $0, d4
        movq    d4, 32(z)
        sbbq    $0, d5
        movq    d5, 40(z)

#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
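
// ----------------------------------------------------------------------------
// For illustration only: a hedged C caller sketch (not part of the assembly).
// The prototype is the one documented above; the demo() function and its
// variable names are hypothetical. Operands are 6 little-endian 64-bit
// digits, and per the contract above both inputs must already be reduced
// mod p_384.
//
//      #include <stdint.h>
//
//      extern void bignum_add_p384
//       (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]);
//
//      void demo(void)
//      {
//          uint64_t x[6] = {1, 0, 0, 0, 0, 0};     // x = 1
//          uint64_t y[6] = {2, 0, 0, 0, 0, 0};     // y = 2
//          uint64_t z[6];
//          bignum_add_p384(z, x, y);               // z = (1 + 2) mod p_384 = 3
//      }
// ----------------------------------------------------------------------------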
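
// ----------------------------------------------------------------------------
// For illustration only: a minimal C reference model of the strategy above,
// e.g. as a test oracle. It assumes a GCC/Clang-style unsigned __int128; the
// name ref_add_p384 and the r384 table are hypothetical, not part of
// s2n-bignum. It mirrors the same three steps: the 7-word add, the addition
// of the negation r_384 = 2^448 - p_384, and the mask-compensated
// subtraction of 2^384 - p_384.
//
//      #include <stdint.h>
//
//      // Low six digits of r_384 = 2^448 - p_384 (the seventh digit is -1);
//      // these are also exactly the digits of 2^384 - p_384.
//      static const uint64_t r384[6] =
//       {0xffffffff00000001ULL, 0x00000000ffffffffULL, 1, 0, 0, 0};
//
//      static void ref_add_p384
//       (uint64_t z[static 6], const uint64_t x[static 6],
//        const uint64_t y[static 6])
//      {
//          uint64_t d[6];
//          unsigned __int128 acc = 0;
//
//          // 2^384 * c + [d5;...;d0] = x + y
//          for (int i = 0; i < 6; i++)
//          {   acc += (unsigned __int128) x[i] + y[i];
//              d[i] = (uint64_t) acc;
//              acc >>= 64;
//          }
//          uint64_t c = (uint64_t) acc;
//
//          // Add the low digits of r_384, folding its top digit (-1) and
//          // both carries into the mask, as "adcq $-1, c" does above.
//          acc = 0;
//          for (int i = 0; i < 6; i++)
//          {   acc += (unsigned __int128) d[i] + r384[i];
//              d[i] = (uint64_t) acc;
//              acc >>= 64;
//          }
//          uint64_t mask = c + (uint64_t) acc - 1; // all ones iff x+y < p_384
//
//          // Compensating subtraction of mask * (2^384 - p_384)
//          unsigned __int128 bor = 0;
//          for (int i = 0; i < 6; i++)
//          {   unsigned __int128 t =
//                  (unsigned __int128) d[i] - (r384[i] & mask) - bor;
//              z[i] = (uint64_t) t;
//              bor = (uint64_t) (t >> 64) & 1;
//          }
//      }
// ----------------------------------------------------------------------------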