// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384
// Input x[6]; output z[6]
//
//    extern void bignum_deamont_p384
//     (uint64_t z[static 6], uint64_t x[static 6]);
//
// Convert a 6-digit bignum x out of its (optionally almost) Montgomery form,
// "almost" meaning any 6-digit input will work, with no range restriction.
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"


        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p384)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p384)
        .text

// Argument registers (after any Windows ABI shim at function entry).
// Note that x is %rsi, which the montreds macro below zeroes and uses as
// scratch, so x is only valid as a pointer until the first montreds.

#define z %rdi
#define x %rsi

// Additional temps in the correction phase
// These alias %rax, %rcx and %rdx, which montreds also uses as scratch,
// so they only hold meaningful values once the reductions are finished.

#define u %rax
#define v %rcx
#define w %rdx

// 32-bit view of v: writing it with movl zero-extends into the full v.

#define vshort %ecx

// Core one-step "short" Montgomery reduction macro. Takes input in
// [d5;d4;d3;d2;d1;d0] and returns result in [d6;d5;d4;d3;d2;d1],
// adding to the existing contents of [d5;d4;d3;d2;d1;d0]. This
// is intended only for 6-word inputs as in mapping out of Montgomery,
// not for the general case of Montgomery multiplication. It is fine
// for d6 to be the same register as d0.
//
// Parms:  montreds(d6,d5,d4,d3,d2,d1,d0)
//
// We want to add (2^384 - 2^128 - 2^96 + 2^32 - 1) * w
// where w = [d0 + (d0<<32)] mod 2^64
//
// That multiple of p_384 is added in the form 2^384 * w - (2^384 - p_384) * w,
// where 2^384 - p_384 = 2^128 + 2^96 - 2^32 + 1 has the three 64-bit digits
// [1; 0x00000000ffffffff; 0xffffffff00000001].
//
// Clobbers: %rax, %rcx, %rdx, %rsi and the flags, in addition to writing
// d0 (as a temporary) and d1..d6. The d registers must therefore not be
// any of %rax, %rcx, %rdx, %rsi. Uses mulxq (a BMI2 instruction).

#define montreds(d6,d5,d4,d3,d2,d1,d0)                                  \
/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */         \
        movq    d0, %rdx ;                                        \
        shlq    $32, %rdx ;                                        \
        addq    d0, %rdx ;                                        \
/* Construct [%rsi;%rcx;%rax;-] = (2^384 - p_384) * w           */         \
/* We know the lowest word will cancel so we can re-use d0   */         \
/* as a temp.                                                */         \
/* The xorq zeroes %rsi and also clears CF. Neither mulxq    */         \
/* nor mov changes the flags, so the first adcq below sees   */         \
/* a carry-in of zero.                                       */         \
        xorq    %rsi, %rsi ;                                       \
        movq    $0xffffffff00000001, %rax ;                        \
/* %rax := high word of w * 0xffffffff00000001. The low word */         \
/* lands in %rcx and is discarded by the following movl.     */         \
        mulxq   %rax, %rcx, %rax ;                                  \
        movl    $0x00000000ffffffff, %ecx ;                        \
/* [%rcx;d0] := w * 0x00000000ffffffff (high;low)            */         \
        mulxq   %rcx, d0, %rcx ;                                   \
        adcq    d0, %rax ;                                        \
/* Adding %rdx = w here accounts for the top digit 1         */         \
        adcq    %rdx, %rcx ;                                       \
        adcq    $0, %rsi ;                                         \
/* Now subtract that and add 2^384 * w                       */         \
        subq    %rax, d1 ;                                        \
        sbbq    %rcx, d2 ;                                        \
        sbbq    %rsi, d3 ;                                        \
        sbbq    $0, d4 ;                                          \
        sbbq    $0, d5 ;                                          \
/* New top digit d6 := w less the borrow out of the chain;   */         \
/* movq leaves CF intact for the final sbbq.                 */         \
        movq    %rdx, d6 ;                                        \
        sbbq    $0, d6

// Entry point: z := (x / 2^384) mod p_384 for a 6-digit input x.
// Straight-line code with no branches or calls. Uses %rax, %rcx, %rdx,
// %rsi and %r8-%r13 as working registers; %r12 and %r13 are saved and
// restored on the stack (plus %rdi and %rsi under the Windows ABI).

S2N_BN_SYMBOL(bignum_deamont_p384):

// Under the Microsoft x64 ABI the arguments arrive in %rcx and %rdx and
// %rdi/%rsi are callee-saved, so save them and move the arguments into
// the registers the rest of the code expects.

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
#endif

// Save more registers to play with

        pushq   %r12
        pushq   %r13

// Set up an initial window [%r13,%r12,%r11,%r10,%r9,%r8] = x
// All six digits are loaded here, before the first montreds, because
// montreds overwrites %rsi (the x pointer).

        movq    (x), %r8
        movq    8(x), %r9
        movq    16(x), %r10
        movq    24(x), %r11
        movq    32(x), %r12
        movq    40(x), %r13

// Each montreds below consumes the current lowest digit (last argument)
// and reuses that same register for the new top digit (first argument),
// so the 6-digit window rotates by one register per step.

// Montgomery reduce window 0

        montreds(%r8,%r13,%r12,%r11,%r10,%r9,%r8)

// Montgomery reduce window 1

        montreds(%r9,%r8,%r13,%r12,%r11,%r10,%r9)

// Montgomery reduce window 2

        montreds(%r10,%r9,%r8,%r13,%r12,%r11,%r10)

// Montgomery reduce window 3

        montreds(%r11,%r10,%r9,%r8,%r13,%r12,%r11)

// Montgomery reduce window 4

        montreds(%r12,%r11,%r10,%r9,%r8,%r13,%r12)

// Montgomery reduce window 5

        montreds(%r13,%r12,%r11,%r10,%r9,%r8,%r13)

// After six steps the window is back in its original register order.

// Do a test addition of dd = [%r13;%r12;%r11;%r10;%r9;%r8] and
// 2^384 - p_384 = [0;0;0;1;v;u], hence setting CF iff
// dd + (2^384 - p_384) >= 2^384, hence iff dd >= p_384.
// The movl into vshort zero-extends, giving v = 0x00000000ffffffff.

        movq    $0xffffffff00000001, u
        movl    $0x00000000ffffffff, vshort

// Only the carry matters here: each sum is written to w and then
// overwritten by the next movq, which does not disturb CF, so the
// digits of dd themselves are left unchanged.

        movq    %r8, w
        addq    u, w
        movq    %r9, w
        adcq    v, w
        movq    %r10, w
        adcq    $1, w
        movq    %r11, w
        adcq    $0, w
        movq    %r12, w
        adcq    $0, w
        movq    %r13, w
        adcq    $0, w

// Convert CF to a bitmask in w
// (all ones if CF was set, zero otherwise)

        sbbq    w, w

// Masked addition of 2^384 - p_384, hence subtraction of p_384
// When the mask is zero all three addends become zero and dd is unchanged;
// otherwise [w;v;u] = [1;v;u] and the carry out of the top digit is dropped.

        andq    w, u
        andq    w, v
        andq    $1, w

        addq   u, %r8
        adcq   v, %r9
        adcq   w, %r10
        adcq   $0, %r11
        adcq   $0, %r12
        adcq   $0, %r13

// Write back the result

        movq    %r8, (z)
        movq    %r9, 8(z)
        movq    %r10, 16(z)
        movq    %r11, 24(z)
        movq    %r12, 32(z)
        movq    %r13, 40(z)

// Restore registers and return
// (pops mirror the pushes at entry, in reverse order)

        popq    %r13
        popq    %r12

#if WINDOWS_ABI
        popq   %rsi
        popq   %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif