// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming
// x reduced
// Inputs c, x[9]; output z[9]
//
//    extern void bignum_cmul_p521_alt
//     (uint64_t z[static 9], uint64_t c, uint64_t x[static 9]);
//
// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x
// Microsoft x64 ABI:   RCX = z, RDX = c, R8 = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"


        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p521_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p521_alt)
        .text

// Register roles for the routine below. Several names intentionally map to
// the same physical register and are used in different phases of the code:
//
//      %rcx : x  (input pointer), later d8
//      %rsi : m  (word multiplier), later d9 / h
//      %rdx : d  (high half of each mulq), later c / cshort
//      %rax : a  (loaded digit and low half of each mulq), later dd

// Pointer to the 9-word output buffer

#define z %rdi

// Pointer to the 9-word input. The routine copies it here so that %rdx is
// free to receive the high parts of products during the initial multiply.

#define x %rcx

// The single-word multiplier. It is only needed for the initial multiply;
// the register is reused (as d9 / h) once the last mulq has been issued.

#define m %rsi

// Used in the final reduction: briefly as scratch, then holding zero.
// cshort is the 32-bit view of the same register.

#define c %rdx
#define cshort %edx

// mulq implicitly takes one operand from %rax and leaves the product
// in %rdx:%rax, so a is the low half and d the high half.

#define a %rax
#define d %rdx

// Accumulates the bitwise AND of digits d7,...,d1; same register as a

#define dd %rax

// Digits of the 10-word product c * x. d4..d7 sit in %rbx, %rbp, %r12 and
// %r13, which the routine pushes on entry and pops before returning. The
// last two digits are aliased to inputs that are no longer used by then.

#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %rbx
#define d5 %rbp
#define d6 %r12
#define d7 %r13
#define d8 %rcx
#define d9 %rsi

// Same register as d9: after the shift that splits the product at bit 521
// it holds the high part h.

#define h d9

// Helper macros for the two regular instruction patterns in this routine.
// Their operands are the register names #defined earlier in the file.

// mulstep(src,lo,hi): multiply the 64-bit word at memory operand src by m,
// add the low half of the product into lo, and set hi to the high half of
// the product plus the carry out of that addition.

#define mulstep(src,lo,hi)                      \
        movq    src, a ;                        \
        mulq    m ;                             \
        xorq    hi, hi ;                        \
        addq    a, lo ;                         \
        adcq    d, hi

// addstore(src,dig,dst): dig := dig + src + CF, then store dig to memory
// operand dst. The store is a plain movq, which does not modify flags, so
// the carry out of the adcq is still available to the following instruction.

#define addstore(src,dig,dst)                   \
        adcq    src, dig ;                      \
        movq    dig, dst

S2N_BN_SYMBOL(bignum_cmul_p521_alt):

// Under the Microsoft x64 ABI the arguments arrive in %rcx, %rdx, %r8 and
// %rdi, %rsi are callee-saved. Preserve those two and move the arguments
// into the standard x86-64 argument registers used by the code below.

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// Preserve the callee-saved registers that hold digits d4..d7

        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13

// Move the input pointer out of %rdx, which mulq overwrites with the
// high half of every product.

        movq    %rdx, x

// Form the 10-word product [d9; ...; d0] = c * x, one input word at a time.
// The lowest word just seeds d0 and d1.

        movq    (x), a
        mulq    m
        movq    a, d0
        movq    d, d1

// Words 1..6 follow the regular pattern

        mulstep(8(x),d1,d2)
        mulstep(16(x),d2,d3)
        mulstep(24(x),d3,d4)
        mulstep(32(x),d4,d5)
        mulstep(40(x),d5,d6)
        mulstep(48(x),d6,d7)

// Words 7 and 8 are handled by hand because of register aliasing.
// d8 is the same register as x, so the top input word is fetched into a
// before d8 is zeroed, and it is zeroed with movq (which leaves flags
// alone) so that the carry from "addq a, d7" still reaches the adcq.
// d9 is the same register as m, so it is cleared only after the last mulq.

        movq    56(x), a
        mulq    m
        addq    a, d7
        movq    64(x), a
        movq    $0, d8
        adcq    d, d8
        mulq    m
        xorq    d9, d9
        addq    a, d8
        adcq    d, d9

// dd := d1 AND d2 AND ... AND d7, so dd is all 1s exactly when every one of
// those digits is. This cannot be overlapped with the multiplications above
// because dd lives in %rax, which mulq uses.

        movq    d1, dd
        andq    d2, dd
        andq    d3, dd
        andq    d4, dd
        andq    d5, dd
        andq    d6, dd
        andq    d7, dd

// Split the product at bit 521. The double shift leaves in h = d9 the bits
// from position 521 upward (the old d9 shifted up by 55, filled from the top
// 55 bits of d8). The low part is l = [d8;d7;...;d0] with d8 taken modulo
// 2^9; rather than masking d8 now, set its bits 9..63 to 1 so that a carry
// into it propagates straight out of the top in the test below.

        shldq   $55, d8, h
        orq     $~0x1FF, d8

// Test h + l >= p_521, which is equivalent to h + l + 1 >= 2^521. A carry
// out of d0 + h + 1 can only reach bit 521 if d7,...,d1 are all 1s, so
// it is enough to push the carry through dd and then through the stuffed
// d8; the answer ends up in CF. With x reduced, h <= (c * x) / 2^521 < c,
// so h + 1 computed by the lea does not wrap. After the addq, c is
// zeroed with a 32-bit movl (flags untouched) and serves as the constant 0.

        leaq    1(h), c
        addq    d0, c
        movl    $0, cshort
        adcq    c, dd
        movq    d8, a
        adcq    c, a

// With CF set the result is (h + l) - p_521 = (h + l + 1) - 2^521, otherwise
// it is h + l; both are h + l + CF truncated to 521 bits. Add h and CF into
// d0, ripple the carry upward, and store each digit as soon as it is final.
// The final mask on d8 drops bit 521 and the 1 bits stuffed in earlier.

        addstore(h,d0,(z))
        addstore(c,d1,8(z))
        addstore(c,d2,16(z))
        addstore(c,d3,24(z))
        addstore(c,d4,32(z))
        addstore(c,d5,40(z))
        addstore(c,d6,48(z))
        addstore(c,d7,56(z))
        adcq    c, d8
        andq    $0x1FF, d8
        movq    d8, 64(z)

// Pop the saved registers in reverse order and return

        popq    %r13
        popq    %r12
        popq    %rbp
        popq    %rbx

#if WINDOWS_ABI
        popq   %rsi
        popq   %rdi
#endif
        ret

// On Linux ELF targets, emit an empty .note.GNU-stack section. GNU toolchains
// read its presence (without the executable flag) as this object not needing
// an executable stack.

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif