// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// The x25519 function for curve25519 (byte array arguments)
// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes)
//
// extern void curve25519_x25519_byte
//  (uint8_t res[static 32],uint8_t scalar[static 32],uint8_t point[static 32])
//
// Given a scalar n and the X coordinate of an input point P = (X,Y) on
// curve25519 (Y can live in any extension field of characteristic 2^255-19),
// this returns the X coordinate of n * P = (X, Y), or 0 when n * P is the
// point at infinity. Both n and X inputs are first slightly modified/mangled
// as specified in the relevant RFC (https://www.rfc-editor.org/rfc/rfc7748);
// in particular the lower three bits of n are set to zero.
//
// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_x25519_byte)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_x25519_byte)

        .text
        .balign 4

// Size of individual field elements

#define NUMSIZE 32

// Stable homes for the input result argument during the whole body
// and other variables that are only needed prior to the modular inverse.

#define res x23
#define i x20
#define swap x21

// Pointers to result x coord to be written

#define resx res, #0

// Pointer-offset pairs for temporaries on stack with some aliasing.

#define scalar sp, #(0*NUMSIZE)

#define pointx sp, #(1*NUMSIZE)

#define zm sp, #(2*NUMSIZE)
#define sm sp, #(2*NUMSIZE)
#define dpro sp, #(2*NUMSIZE)

#define sn sp, #(3*NUMSIZE)

#define dm sp, #(4*NUMSIZE)

#define zn sp, #(5*NUMSIZE)
#define dn sp, #(5*NUMSIZE)
#define e sp, #(5*NUMSIZE)

#define dmsn sp, #(6*NUMSIZE)
#define p sp, #(6*NUMSIZE)

#define xm sp, #(7*NUMSIZE)
#define dnsm sp, #(7*NUMSIZE)
#define spro sp, #(7*NUMSIZE)

#define d sp, #(8*NUMSIZE)

#define xn sp, #(9*NUMSIZE)
#define s sp, #(9*NUMSIZE)

// Total size to reserve on the stack

#define NSPACE (10*NUMSIZE)

// Macro wrapping up the basic field operation bignum_mul_p25519, only
// trivially different from a pure function call to that subroutine.
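//
// As an informal reference (not part of the build), the operation performed
// here on 4-digit little-endian field elements is simply
//
//     P0 := (P1 * P2) mod p_25519,   where p_25519 = 2^255 - 19,
//
// with the result fully reduced into [0, p_25519): the 512-bit product is
// folded back into 256 bits using 2^256 == 38 (mod p_25519), and a final
// conditional subtraction of p_25519 is applied.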

#define mul_p25519(P0,P1,P2) \
        ldp x3, x4, [P1]; \
        ldp x5, x6, [P2]; \
        umull x7, w3, w5; \
        lsr x0, x3, #32; \
        umull x15, w0, w5; \
        lsr x16, x5, #32; \
        umull x8, w16, w0; \
        umull x16, w3, w16; \
        adds x7, x7, x15, lsl #32; \
        lsr x15, x15, #32; \
        adc x8, x8, x15; \
        adds x7, x7, x16, lsl #32; \
        lsr x16, x16, #32; \
        adc x8, x8, x16; \
        mul x9, x4, x6; \
        umulh x10, x4, x6; \
        subs x4, x4, x3; \
        cneg x4, x4, cc; \
        csetm x16, cc; \
        adds x9, x9, x8; \
        adc x10, x10, xzr; \
        subs x3, x5, x6; \
        cneg x3, x3, cc; \
        cinv x16, x16, cc; \
        mul x15, x4, x3; \
        umulh x3, x4, x3; \
        adds x8, x7, x9; \
        adcs x9, x9, x10; \
        adc x10, x10, xzr; \
        cmn x16, #0x1; \
        eor x15, x15, x16; \
        adcs x8, x15, x8; \
        eor x3, x3, x16; \
        adcs x9, x3, x9; \
        adc x10, x10, x16; \
        ldp x3, x4, [P1+16]; \
        ldp x5, x6, [P2+16]; \
        umull x11, w3, w5; \
        lsr x0, x3, #32; \
        umull x15, w0, w5; \
        lsr x16, x5, #32; \
        umull x12, w16, w0; \
        umull x16, w3, w16; \
        adds x11, x11, x15, lsl #32; \
        lsr x15, x15, #32; \
        adc x12, x12, x15; \
        adds x11, x11, x16, lsl #32; \
        lsr x16, x16, #32; \
        adc x12, x12, x16; \
        mul x13, x4, x6; \
        umulh x14, x4, x6; \
        subs x4, x4, x3; \
        cneg x4, x4, cc; \
        csetm x16, cc; \
        adds x13, x13, x12; \
        adc x14, x14, xzr; \
        subs x3, x5, x6; \
        cneg x3, x3, cc; \
        cinv x16, x16, cc; \
        mul x15, x4, x3; \
        umulh x3, x4, x3; \
        adds x12, x11, x13; \
        adcs x13, x13, x14; \
        adc x14, x14, xzr; \
        cmn x16, #0x1; \
        eor x15, x15, x16; \
        adcs x12, x15, x12; \
        eor x3, x3, x16; \
        adcs x13, x3, x13; \
        adc x14, x14, x16; \
        ldp x3, x4, [P1+16]; \
        ldp x15, x16, [P1]; \
        subs x3, x3, x15; \
        sbcs x4, x4, x16; \
        csetm x16, cc; \
        ldp x15, x0, [P2]; \
        subs x5, x15, x5; \
        sbcs x6, x0, x6; \
        csetm x0, cc; \
        eor x3, x3, x16; \
        subs x3, x3, x16; \
        eor x4, x4, x16; \
        sbc x4, x4, x16; \
        eor x5, x5, x0; \
        subs x5, x5, x0; \
        eor x6, x6, x0; \
        sbc x6, x6, x0; \
        eor x16, x0, x16; \
        adds x11, x11, x9; \
        adcs x12, x12, x10; \
        adcs x13, x13, xzr; \
        adc x14, x14, xzr; \
        mul x2, x3, x5; \
        umulh x0, x3, x5; \
        mul x15, x4, x6; \
        umulh x1, x4, x6; \
        subs x4, x4, x3; \
        cneg x4, x4, cc; \
        csetm x9, cc; \
        adds x15, x15, x0; \
        adc x1, x1, xzr; \
        subs x6, x5, x6; \
        cneg x6, x6, cc; \
        cinv x9, x9, cc; \
        mul x5, x4, x6; \
        umulh x6, x4, x6; \
        adds x0, x2, x15; \
        adcs x15, x15, x1; \
        adc x1, x1, xzr; \
        cmn x9, #0x1; \
        eor x5, x5, x9; \
        adcs x0, x5, x0; \
        eor x6, x6, x9; \
        adcs x15, x6, x15; \
        adc x1, x1, x9; \
        adds x9, x11, x7; \
        adcs x10, x12, x8; \
        adcs x11, x13, x11; \
        adcs x12, x14, x12; \
        adcs x13, x13, xzr; \
        adc x14, x14, xzr; \
        cmn x16, #0x1; \
        eor x2, x2, x16; \
        adcs x9, x2, x9; \
        eor x0, x0, x16; \
        adcs x10, x0, x10; \
        eor x15, x15, x16; \
        adcs x11, x15, x11; \
        eor x1, x1, x16; \
        adcs x12, x1, x12; \
        adcs x13, x13, x16; \
        adc x14, x14, x16; \
        mov x3, #0x26; \
        umull x4, w11, w3; \
        add x4, x4, w7, uxtw; \
        lsr x7, x7, #32; \
        lsr x11, x11, #32; \
        umaddl x11, w11, w3, x7; \
        mov x7, x4; \
        umull x4, w12, w3; \
        add x4, x4, w8, uxtw; \
        lsr x8, x8, #32; \
        lsr x12, x12, #32; \
        umaddl x12, w12, w3, x8; \
        mov x8, x4; \
        umull x4, w13, w3; \
        add x4, x4, w9, uxtw; \
        lsr x9, x9, #32; \
        lsr x13, x13, #32; \
        umaddl x13, w13, w3, x9; \
        mov x9, x4; \
        umull x4, w14, w3; \
        add x4, x4, w10, uxtw; \
        lsr x10, x10, #32; \
        lsr x14, x14, #32; \
        umaddl x14, w14, w3, x10; \
        mov x10, x4; \
        lsr x0, x14, #31; \
        mov x5, #0x13; \
        umaddl x5, w5, w0, x5; \
        add x7, x7, x5; \
        adds x7, x7, x11, lsl #32; \
        extr x3, x12, x11, #32; \
        adcs x8, x8, x3; \
        extr x3, x13, x12, #32; \
        adcs x9, x9, x3; \
        extr x3, x14, x13, #32; \
        lsl x5, x0, #63; \
        eor x10, x10, x5; \
        adc x10, x10, x3; \
        mov x3, #0x13; \
        tst x10, #0x8000000000000000; \
        csel x3, x3, xzr, pl; \
        subs x7, x7, x3; \
        sbcs x8, x8, xzr; \
        sbcs x9, x9, xzr; \
        sbc x10, x10, xzr; \
        and x10, x10, #0x7fffffffffffffff; \
        stp x7, x8, [P0]; \
        stp x9, x10, [P0+16]

// A version of multiplication that only guarantees output < 2 * p_25519.
// This basically skips the +1 and final correction in quotient estimation.

#define mul_4(P0,P1,P2) \
        ldp x3, x4, [P1]; \
        ldp x5, x6, [P2]; \
        umull x7, w3, w5; \
        lsr x0, x3, #32; \
        umull x15, w0, w5; \
        lsr x16, x5, #32; \
        umull x8, w16, w0; \
        umull x16, w3, w16; \
        adds x7, x7, x15, lsl #32; \
        lsr x15, x15, #32; \
        adc x8, x8, x15; \
        adds x7, x7, x16, lsl #32; \
        lsr x16, x16, #32; \
        adc x8, x8, x16; \
        mul x9, x4, x6; \
        umulh x10, x4, x6; \
        subs x4, x4, x3; \
        cneg x4, x4, cc; \
        csetm x16, cc; \
        adds x9, x9, x8; \
        adc x10, x10, xzr; \
        subs x3, x5, x6; \
        cneg x3, x3, cc; \
        cinv x16, x16, cc; \
        mul x15, x4, x3; \
        umulh x3, x4, x3; \
        adds x8, x7, x9; \
        adcs x9, x9, x10; \
        adc x10, x10, xzr; \
        cmn x16, #0x1; \
        eor x15, x15, x16; \
        adcs x8, x15, x8; \
        eor x3, x3, x16; \
        adcs x9, x3, x9; \
        adc x10, x10, x16; \
        ldp x3, x4, [P1+16]; \
        ldp x5, x6, [P2+16]; \
        umull x11, w3, w5; \
        lsr x0, x3, #32; \
        umull x15, w0, w5; \
        lsr x16, x5, #32; \
        umull x12, w16, w0; \
        umull x16, w3, w16; \
        adds x11, x11, x15, lsl #32; \
        lsr x15, x15, #32; \
        adc x12, x12, x15; \
        adds x11, x11, x16, lsl #32; \
        lsr x16, x16, #32; \
        adc x12, x12, x16; \
        mul x13, x4, x6; \
        umulh x14, x4, x6; \
        subs x4, x4, x3; \
        cneg x4, x4, cc; \
        csetm x16, cc; \
        adds x13, x13, x12; \
        adc x14, x14, xzr; \
        subs x3, x5, x6; \
        cneg x3, x3, cc; \
        cinv x16, x16, cc; \
        mul x15, x4, x3; \
        umulh x3, x4, x3; \
        adds x12, x11, x13; \
        adcs x13, x13, x14; \
        adc x14, x14, xzr; \
        cmn x16, #0x1; \
        eor x15, x15, x16; \
        adcs x12, x15, x12; \
        eor x3, x3, x16; \
        adcs x13, x3, x13; \
        adc x14, x14, x16; \
        ldp x3, x4, [P1+16]; \
        ldp x15, x16, [P1]; \
        subs x3, x3, x15; \
        sbcs x4, x4, x16; \
        csetm x16, cc; \
        ldp x15, x0, [P2]; \
        subs x5, x15, x5; \
        sbcs x6, x0, x6; \
        csetm x0, cc; \
        eor x3, x3, x16; \
        subs x3, x3, x16; \
        eor x4, x4, x16; \
        sbc x4, x4, x16; \
        eor x5, x5, x0; \
        subs x5, x5, x0; \
        eor x6, x6, x0; \
        sbc x6, x6, x0; \
        eor x16, x0, x16; \
        adds x11, x11, x9; \
        adcs x12, x12, x10; \
        adcs x13, x13, xzr; \
        adc x14, x14, xzr; \
        mul x2, x3, x5; \
        umulh x0, x3, x5; \
        mul x15, x4, x6; \
        umulh x1, x4, x6; \
        subs x4, x4, x3; \
        cneg x4, x4, cc; \
        csetm x9, cc; \
        adds x15, x15, x0; \
        adc x1, x1, xzr; \
        subs x6, x5, x6; \
        cneg x6, x6, cc; \
        cinv x9, x9, cc; \
        mul x5, x4, x6; \
        umulh x6, x4, x6; \
        adds x0, x2, x15; \
        adcs x15, x15, x1; \
        adc x1, x1, xzr; \
        cmn x9, #0x1; \
        eor x5, x5, x9; \
        adcs x0, x5, x0; \
        eor x6, x6, x9; \
        adcs x15, x6, x15; \
        adc x1, x1, x9; \
        adds x9, x11, x7; \
        adcs x10, x12, x8; \
        adcs x11, x13, x11; \
        adcs x12, x14, x12; \
        adcs x13, x13, xzr; \
        adc x14, x14, xzr; \
        cmn x16, #0x1; \
        eor x2, x2, x16; \
        adcs x9, x2, x9; \
        eor x0, x0, x16; \
        adcs x10, x0, x10; \
        eor x15, x15, x16; \
        adcs x11, x15, x11; \
        eor x1, x1, x16; \
        adcs x12, x1, x12; \
        adcs x13, x13, x16; \
        adc x14, x14, x16; \
        mov x3, #0x26; \
        umull x4, w11, w3; \
        add x4, x4, w7, uxtw; \
        lsr x7, x7, #32; \
        lsr x11, x11, #32; \
        umaddl x11, w11, w3, x7; \
        mov x7, x4; \
        umull x4, w12, w3; \
        add x4, x4, w8, uxtw; \
        lsr x8, x8, #32; \
        lsr x12, x12, #32; \
        umaddl x12, w12, w3, x8; \
        mov x8, x4; \
        umull x4, w13, w3; \
        add x4, x4, w9, uxtw; \
        lsr x9, x9, #32; \
        lsr x13, x13, #32; \
        umaddl x13, w13, w3, x9; \
        mov x9, x4; \
        umull x4, w14, w3; \
        add x4, x4, w10, uxtw; \
        lsr x10, x10, #32; \
        lsr x14, x14, #32; \
        umaddl x14, w14, w3, x10; \
        mov x10, x4; \
        lsr x0, x14, #31; \
        mov x5, #0x13; \
        umull x5, w5, w0; \
        add x7, x7, x5; \
        adds x7, x7, x11, lsl #32; \
        extr x3, x12, x11, #32; \
        adcs x8, x8, x3; \
        extr x3, x13, x12, #32; \
        adcs x9, x9, x3; \
        extr x3, x14, x13, #32; \
        lsl x5, x0, #63; \
        eor x10, x10, x5; \
        adc x10, x10, x3; \
        stp x7, x8, [P0]; \
        stp x9, x10, [P0+16]

// Squaring just giving a result < 2 * p_25519, which is done by
// basically skipping the +1 in the quotient estimate and the final
// optional correction.

#define sqr_4(P0,P1) \
        ldp x10, x11, [P1]; \
        ldp x12, x13, [P1+16]; \
        umull x2, w10, w10; \
        lsr x14, x10, #32; \
        umull x3, w14, w14; \
        umull x14, w10, w14; \
        adds x2, x2, x14, lsl #33; \
        lsr x14, x14, #31; \
        adc x3, x3, x14; \
        umull x4, w11, w11; \
        lsr x14, x11, #32; \
        umull x5, w14, w14; \
        umull x14, w11, w14; \
        mul x15, x10, x11; \
        umulh x16, x10, x11; \
        adds x4, x4, x14, lsl #33; \
        lsr x14, x14, #31; \
        adc x5, x5, x14; \
        adds x15, x15, x15; \
        adcs x16, x16, x16; \
        adc x5, x5, xzr; \
        adds x3, x3, x15; \
        adcs x4, x4, x16; \
        adc x5, x5, xzr; \
        umull x6, w12, w12; \
        lsr x14, x12, #32; \
        umull x7, w14, w14; \
        umull x14, w12, w14; \
        adds x6, x6, x14, lsl #33; \
        lsr x14, x14, #31; \
        adc x7, x7, x14; \
        umull x8, w13, w13; \
        lsr x14, x13, #32; \
        umull x9, w14, w14; \
        umull x14, w13, w14; \
        mul x15, x12, x13; \
        umulh x16, x12, x13; \
        adds x8, x8, x14, lsl #33; \
        lsr x14, x14, #31; \
        adc x9, x9, x14; \
        adds x15, x15, x15; \
        adcs x16, x16, x16; \
        adc x9, x9, xzr; \
        adds x7, x7, x15; \
        adcs x8, x8, x16; \
        adc x9, x9, xzr; \
        subs x10, x10, x12; \
        sbcs x11, x11, x13; \
        csetm x16, cc; \
        eor x10, x10, x16; \
        subs x10, x10, x16; \
        eor x11, x11, x16; \
        sbc x11, x11, x16; \
        adds x6, x6, x4; \
        adcs x7, x7, x5; \
        adcs x8, x8, xzr; \
        adc x9, x9, xzr; \
        umull x12, w10, w10; \
        lsr x5, x10, #32; \
        umull x13, w5, w5; \
        umull x5, w10, w5; \
        adds x12, x12, x5, lsl #33; \
        lsr x5, x5, #31; \
        adc x13, x13, x5; \
        umull x15, w11, w11; \
        lsr x5, x11, #32; \
        umull x14, w5, w5; \
        umull x5, w11, w5; \
        mul x4, x10, x11; \
        umulh x16, x10, x11; \
        adds x15, x15, x5, lsl #33; \
        lsr x5, x5, #31; \
        adc x14, x14, x5; \
        adds x4, x4, x4; \
        adcs x16, x16, x16; \
        adc x14, x14, xzr; \
        adds x13, x13, x4; \
        adcs x15, x15, x16; \
        adc x14, x14, xzr; \
        adds x4, x2, x6; \
        adcs x5, x3, x7; \
        adcs x6, x6, x8; \
        adcs x7, x7, x9; \
        csetm x16, cc; \
        subs x4, x4, x12; \
        sbcs x5, x5, x13; \
        sbcs x6, x6, x15; \
        sbcs x7, x7, x14; \
        adcs x8, x8, x16; \
        adc x9, x9, x16; \
        mov x10, #0x26; \
        umull x12, w6, w10; \
        add x12, x12, w2, uxtw; \
        lsr x2, x2, #32; \
        lsr x6, x6, #32; \
        umaddl x6, w6, w10, x2; \
        mov x2, x12; \
        umull x12, w7, w10; \
        add x12, x12, w3, uxtw; \
        lsr x3, x3, #32; \
        lsr x7, x7, #32; \
        umaddl x7, w7, w10, x3; \
        mov x3, x12; \
        umull x12, w8, w10; \
        add x12, x12, w4, uxtw; \
        lsr x4, x4, #32; \
        lsr x8, x8, #32; \
        umaddl x8, w8, w10, x4; \
        mov x4, x12; \
        umull x12, w9, w10; \
        add x12, x12, w5, uxtw; \
        lsr x5, x5, #32; \
        lsr x9, x9, #32; \
        umaddl x9, w9, w10, x5; \
        mov x5, x12; \
        lsr x13, x9, #31; \
        mov x11, #0x13; \
        umull x11, w11, w13; \
        add x2, x2, x11; \
        adds x2, x2, x6, lsl #32; \
        extr x10, x7, x6, #32; \
        adcs x3, x3, x10; \
        extr x10, x8, x7, #32; \
        adcs x4, x4, x10; \
        extr x10, x9, x8, #32; \
        lsl x11, x13, #63; \
        eor x5, x5, x11; \
        adc x5, x5, x10; \
        stp x2, x3, [P0]; \
        stp x4, x5, [P0+16]
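
// Both mul_4 and sqr_4 rely on the identity 2^256 == 38 (mod p_25519), since
// 2^256 = 2 * (2^255 - 19) + 38; the constant 0x26 = 38 in these macros folds
// the high half of the double-length product back into the low half. Skipping
// the final correction leaves results that are only reduced below 2 * p_25519,
// which is all the ladder arithmetic below requires.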

// Modular addition with double modulus 2 * p_25519 = 2^256 - 38.
// This only ensures that the result fits in 4 digits, not that it is reduced
// even w.r.t. the double modulus. The result is always correct modulo p_25519
// provided the sum of the inputs is < 2^256 + 2^256 - 38, so in particular
// provided at least one of them is reduced w.r.t. the double modulus.

#define add_twice4(P0,P1,P2) \
        ldp x3, x4, [P1]; \
        ldp x7, x8, [P2]; \
        adds x3, x3, x7; \
        adcs x4, x4, x8; \
        ldp x5, x6, [P1+16]; \
        ldp x7, x8, [P2+16]; \
        adcs x5, x5, x7; \
        adcs x6, x6, x8; \
        mov x9, #38; \
        csel x9, x9, xzr, cs; \
        adds x3, x3, x9; \
        adcs x4, x4, xzr; \
        adcs x5, x5, xzr; \
        adc x6, x6, xzr; \
        stp x3, x4, [P0]; \
        stp x5, x6, [P0+16]

// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38

#define sub_twice4(p0,p1,p2) \
        ldp x5, x6, [p1]; \
        ldp x4, x3, [p2]; \
        subs x5, x5, x4; \
        sbcs x6, x6, x3; \
        ldp x7, x8, [p1+16]; \
        ldp x4, x3, [p2+16]; \
        sbcs x7, x7, x4; \
        sbcs x8, x8, x3; \
        mov x4, #38; \
        csel x3, x4, xzr, lo; \
        subs x5, x5, x3; \
        sbcs x6, x6, xzr; \
        sbcs x7, x7, xzr; \
        sbc x8, x8, xzr; \
        stp x5, x6, [p0]; \
        stp x7, x8, [p0+16]

// Combined z = c * x + y with reduction only < 2 * p_25519
// where c is initially in the X1 register. It is assumed
// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a
// high mul in the final part.

#define cmadd_4(p0,p2,p3) \
        ldp x7, x8, [p2]; \
        ldp x9, x10, [p2+16]; \
        mul x3, x1, x7; \
        mul x4, x1, x8; \
        mul x5, x1, x9; \
        mul x6, x1, x10; \
        umulh x7, x1, x7; \
        umulh x8, x1, x8; \
        umulh x9, x1, x9; \
        umulh x10, x1, x10; \
        adds x4, x4, x7; \
        adcs x5, x5, x8; \
        adcs x6, x6, x9; \
        adc x10, x10, xzr; \
        ldp x7, x8, [p3]; \
        adds x3, x3, x7; \
        adcs x4, x4, x8; \
        ldp x7, x8, [p3+16]; \
        adcs x5, x5, x7; \
        adcs x6, x6, x8; \
        adc x10, x10, xzr; \
        cmn x6, x6; \
        bic x6, x6, #0x8000000000000000; \
        adc x8, x10, x10; \
        mov x9, #19; \
        mul x7, x8, x9; \
        adds x3, x3, x7; \
        adcs x4, x4, xzr; \
        adcs x5, x5, xzr; \
        adc x6, x6, xzr; \
        stp x3, x4, [p0]; \
        stp x5, x6, [p0+16]

// Multiplex: z := if NZ then x else y

#define mux_4(p0,p1,p2) \
        ldp x0, x1, [p1]; \
        ldp x2, x3, [p2]; \
        csel x0, x0, x2, ne; \
        csel x1, x1, x3, ne; \
        stp x0, x1, [p0]; \
        ldp x0, x1, [p1+16]; \
        ldp x2, x3, [p2+16]; \
        csel x0, x0, x2, ne; \
        csel x1, x1, x3, ne; \
        stp x0, x1, [p0+16]

S2N_BN_SYMBOL(curve25519_x25519_byte):

// Save regs and make room for temporaries

        stp x19, x20, [sp, -16]!
        stp x21, x22, [sp, -16]!
        stp x23, x24, [sp, -16]!
        sub sp, sp, #NSPACE

// Move the output pointer to a stable place

        mov res, x0

// Copy the inputs to the local variables with minimal mangling:
//
// - The scalar is in principle turned into 01xxx...xxx000 but
//   in the structure below the special handling of these bits is
//   explicit in the main computation; the scalar is just copied.
//
// - The point x coord is reduced mod 2^255 by masking off the
//   top bit. In the main loop we only need reduction < 2 * p_25519.
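//
// For reference, RFC 7748 decodes the two byte arrays roughly as in this
// illustrative C sketch (a hedged paraphrase, not code used by this file):
//
//     uint8_t n[32], u[32];
//     memcpy(n, scalar, 32); memcpy(u, point, 32);
//     n[0] &= 0xf8;                      // clear the low 3 bits of the scalar
//     n[31] = (n[31] & 0x7f) | 0x40;     // clear bit 255, set bit 254
//     u[31] &= 0x7f;                     // mask off bit 255 of the x coord
//
// Below, only the u[31] masking is applied during the copy; the forced scalar
// bits are realized instead by the explicit initial doubling (bit 254) and the
// three final doublings (the cleared low bits) surrounding the main loop.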

        ldrb w10, [x1]
        ldrb w0, [x1, #1]
        orr x10, x10, x0, lsl #8
        ldrb w0, [x1, #2]
        orr x10, x10, x0, lsl #16
        ldrb w0, [x1, #3]
        orr x10, x10, x0, lsl #24
        ldrb w0, [x1, #4]
        orr x10, x10, x0, lsl #32
        ldrb w0, [x1, #5]
        orr x10, x10, x0, lsl #40
        ldrb w0, [x1, #6]
        orr x10, x10, x0, lsl #48
        ldrb w0, [x1, #7]
        orr x10, x10, x0, lsl #56
        ldrb w11, [x1, #8]
        ldrb w0, [x1, #9]
        orr x11, x11, x0, lsl #8
        ldrb w0, [x1, #10]
        orr x11, x11, x0, lsl #16
        ldrb w0, [x1, #11]
        orr x11, x11, x0, lsl #24
        ldrb w0, [x1, #12]
        orr x11, x11, x0, lsl #32
        ldrb w0, [x1, #13]
        orr x11, x11, x0, lsl #40
        ldrb w0, [x1, #14]
        orr x11, x11, x0, lsl #48
        ldrb w0, [x1, #15]
        orr x11, x11, x0, lsl #56
        stp x10, x11, [scalar]
        ldrb w12, [x1, #16]
        ldrb w0, [x1, #17]
        orr x12, x12, x0, lsl #8
        ldrb w0, [x1, #18]
        orr x12, x12, x0, lsl #16
        ldrb w0, [x1, #19]
        orr x12, x12, x0, lsl #24
        ldrb w0, [x1, #20]
        orr x12, x12, x0, lsl #32
        ldrb w0, [x1, #21]
        orr x12, x12, x0, lsl #40
        ldrb w0, [x1, #22]
        orr x12, x12, x0, lsl #48
        ldrb w0, [x1, #23]
        orr x12, x12, x0, lsl #56
        ldrb w13, [x1, #24]
        ldrb w0, [x1, #25]
        orr x13, x13, x0, lsl #8
        ldrb w0, [x1, #26]
        orr x13, x13, x0, lsl #16
        ldrb w0, [x1, #27]
        orr x13, x13, x0, lsl #24
        ldrb w0, [x1, #28]
        orr x13, x13, x0, lsl #32
        ldrb w0, [x1, #29]
        orr x13, x13, x0, lsl #40
        ldrb w0, [x1, #30]
        orr x13, x13, x0, lsl #48
        ldrb w0, [x1, #31]
        orr x13, x13, x0, lsl #56
        stp x12, x13, [scalar+16]

        ldrb w10, [x2]
        ldrb w0, [x2, #1]
        orr x10, x10, x0, lsl #8
        ldrb w0, [x2, #2]
        orr x10, x10, x0, lsl #16
        ldrb w0, [x2, #3]
        orr x10, x10, x0, lsl #24
        ldrb w0, [x2, #4]
        orr x10, x10, x0, lsl #32
        ldrb w0, [x2, #5]
        orr x10, x10, x0, lsl #40
        ldrb w0, [x2, #6]
        orr x10, x10, x0, lsl #48
        ldrb w0, [x2, #7]
        orr x10, x10, x0, lsl #56
        ldrb w11, [x2, #8]
        ldrb w0, [x2, #9]
        orr x11, x11, x0, lsl #8
        ldrb w0, [x2, #10]
        orr x11, x11, x0, lsl #16
        ldrb w0, [x2, #11]
        orr x11, x11, x0, lsl #24
        ldrb w0, [x2, #12]
        orr x11, x11, x0, lsl #32
        ldrb w0, [x2, #13]
        orr x11, x11, x0, lsl #40
        ldrb w0, [x2, #14]
        orr x11, x11, x0, lsl #48
        ldrb w0, [x2, #15]
        orr x11, x11, x0, lsl #56
        stp x10, x11, [pointx]
        ldrb w12, [x2, #16]
        ldrb w0, [x2, #17]
        orr x12, x12, x0, lsl #8
        ldrb w0, [x2, #18]
        orr x12, x12, x0, lsl #16
        ldrb w0, [x2, #19]
        orr x12, x12, x0, lsl #24
        ldrb w0, [x2, #20]
        orr x12, x12, x0, lsl #32
        ldrb w0, [x2, #21]
        orr x12, x12, x0, lsl #40
        ldrb w0, [x2, #22]
        orr x12, x12, x0, lsl #48
        ldrb w0, [x2, #23]
        orr x12, x12, x0, lsl #56
        ldrb w13, [x2, #24]
        ldrb w0, [x2, #25]
        orr x13, x13, x0, lsl #8
        ldrb w0, [x2, #26]
        orr x13, x13, x0, lsl #16
        ldrb w0, [x2, #27]
        orr x13, x13, x0, lsl #24
        ldrb w0, [x2, #28]
        orr x13, x13, x0, lsl #32
        ldrb w0, [x2, #29]
        orr x13, x13, x0, lsl #40
        ldrb w0, [x2, #30]
        orr x13, x13, x0, lsl #48
        ldrb w0, [x2, #31]
        orr x13, x13, x0, lsl #56
        and x13, x13, #0x7fffffffffffffff
        stp x12, x13, [pointx+16]

// Initialize with explicit doubling in order to handle set bit 254.
// Set swap = 1 and (xm,zm) = (x,1) then double as (xn,zn) = 2 * (x,1).
// We use the fact that the point x coordinate is still in registers.
// Since zm = 1 we could do the doubling with an operation count of
// 2 * S + M instead of 2 * S + 2 * M, but it doesn't seem worth
// the slight complication arising from a different linear combination.

        mov swap, #1
        stp x10, x11, [xm]
        stp x12, x13, [xm+16]
        stp swap, xzr, [zm]
        stp xzr, xzr, [zm+16]

        sub_twice4(d,xm,zm)
        add_twice4(s,xm,zm)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_4(zn,p,e)
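
// Each ladder step uses the usual Montgomery differential formulas: with
// sm = xm + zm, dm = xm - zm, sn = xn + zn, dn = xn - zn, the "addition"
// half produces
//
//     xm' = (dm * sn + dn * sm)^2,    zm' = x * (dm * sn - dn * sm)^2
//
// while the "doubling" half, applied to whichever pair (xt,zt) the current
// scalar bit selects, produces
//
//     xn' = (xt + zt)^2 * (xt - zt)^2
//     zn' = 4 * xt * zt * ((xt - zt)^2 + 121666 * 4 * xt * zt)
//
// where 121666 = 0x1db42 = (A + 2) / 4 for the curve25519 constant A = 486662.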

// The main loop over unmodified bits from i = 253, ..., i = 3 (inclusive).
// This is a classic Montgomery ladder, with the main coordinates only
// reduced mod 2 * p_25519, some intermediate results even more loosely.

        mov i, #253

curve25519_x25519_byte_scalarloop:

// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn

        sub_twice4(dm,xm,zm)
        add_twice4(sn,xn,zn)
        sub_twice4(dn,xn,zn)
        add_twice4(sm,xm,zm)

// ADDING: dmsn = dm * sn
// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt)

        mul_4(dmsn,sn,dm)

        lsr x0, i, #6
        ldr x2, [sp, x0, lsl #3]   // Exploiting scalar = sp exactly
        lsr x2, x2, i
        and x2, x2, #1

        cmp swap, x2
        mov swap, x2

        mux_4(d,dm,dn)
        mux_4(s,sm,sn)

// ADDING: dnsm = sm * dn

        mul_4(dnsm,sm,dn)

// DOUBLING: d = (xt - zt)^2

        sqr_4(d,d)

// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2
// DOUBLING: s = (xt + zt)^2

        sub_twice4(dpro,dmsn,dnsm)
        sqr_4(s,s)
        add_twice4(spro,dmsn,dnsm)
        sqr_4(dpro,dpro)

// DOUBLING: p = 4 * xt * zt = s - d

        sub_twice4(p,s,d)

// ADDING: xm' = (dmsn + dnsm)^2

        sqr_4(xm,spro)

// DOUBLING: e = 121666 * p + d

        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)

// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d

        mul_4(xn,s,d)

// ADDING: zm' = x * (dmsn - dnsm)^2

        mul_4(zm,dpro,pointx)

// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt))
//               = p * (d + 121666 * p)

        mul_4(zn,p,e)

// Loop down as far as 3 (inclusive)

        sub i, i, #1
        cmp i, #3
        bcs curve25519_x25519_byte_scalarloop

// Multiplex directly into (xn,zn) then do three pure doubling steps;
// this accounts for the implicit zeroing of the three lowest bits
// of the scalar. On the very last doubling we *fully* reduce zn mod
// p_25519 to ease checking for degeneracy below.

        cmp swap, xzr
        mux_4(xn,xm,xn)
        mux_4(zn,zm,zn)

        sub_twice4(d,xn,zn)
        add_twice4(s,xn,zn)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_4(zn,p,e)

        sub_twice4(d,xn,zn)
        add_twice4(s,xn,zn)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_4(zn,p,e)

        sub_twice4(d,xn,zn)
        add_twice4(s,xn,zn)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_p25519(zn,p,e)

// The projective result of the scalar multiplication is now (xn,zn).
// First set up the constant sn = 2^255 - 19 for the modular inverse.

        mov x0, #-19
        mov x1, #-1
        mov x2, #0x7fffffffffffffff
        stp x0, x1, [sn]
        stp x1, x2, [sn+16]

// Prepare to call the modular inverse function to get zm = 1/zn

        mov x0, #4
        add x1, zm
        add x2, zn
        add x3, sn
        add x4, p

// Inline copy of bignum_modinv, identical except for stripping out the
// prologue and epilogue saving and restoring registers and the initial
// test for k = 0 (which is trivially false here since k = 4). For more
// details and explanations see "arm/generic/bignum_modinv.S".
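//
// In terms of that routine's interface this corresponds to the call (shown
// only as an informal C-level sketch, valid because p_25519 is prime and
// hence coprime to any nonzero zn):
//
//     bignum_modinv(4, zm, zn, sn, p);   // zm := zn^-1 mod sn = 1/zn mod p_25519
//
// with the temporary space starting at p used as scratch.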

        lsl x10, x0, #3
        add x21, x4, x10
        add x22, x21, x10
        mov x10, xzr
curve25519_x25519_byte_copyloop:
        ldr x11, [x2, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        str x11, [x21, x10, lsl #3]
        str x12, [x22, x10, lsl #3]
        str x12, [x4, x10, lsl #3]
        str xzr, [x1, x10, lsl #3]
        add x10, x10, #0x1
        cmp x10, x0
        b.cc curve25519_x25519_byte_copyloop
        ldr x11, [x4]
        sub x12, x11, #0x1
        str x12, [x4]
        lsl x20, x11, #2
        sub x20, x11, x20
        eor x20, x20, #0x2
        mov x12, #0x1
        madd x12, x11, x20, x12
        mul x11, x12, x12
        madd x20, x12, x20, x20
        mul x12, x11, x11
        madd x20, x11, x20, x20
        mul x11, x12, x12
        madd x20, x12, x20, x20
        madd x20, x11, x20, x20
        lsl x2, x0, #7
curve25519_x25519_byte_outerloop:
        add x10, x2, #0x3f
        lsr x5, x10, #6
        cmp x5, x0
        csel x5, x0, x5, cs
        mov x13, xzr
        mov x15, xzr
        mov x14, xzr
        mov x16, xzr
        mov x19, xzr
        mov x10, xzr
curve25519_x25519_byte_toploop:
        ldr x11, [x21, x10, lsl #3]
        ldr x12, [x22, x10, lsl #3]
        orr x17, x11, x12
        cmp x17, xzr
        and x17, x19, x13
        csel x15, x17, x15, ne
        and x17, x19, x14
        csel x16, x17, x16, ne
        csel x13, x11, x13, ne
        csel x14, x12, x14, ne
        csetm x19, ne
        add x10, x10, #0x1
        cmp x10, x5
        b.cc curve25519_x25519_byte_toploop
        orr x11, x13, x14
        clz x12, x11
        negs x17, x12
        lsl x13, x13, x12
        csel x15, x15, xzr, ne
        lsl x14, x14, x12
        csel x16, x16, xzr, ne
        lsr x15, x15, x17
        lsr x16, x16, x17
        orr x13, x13, x15
        orr x14, x14, x16
        ldr x15, [x21]
        ldr x16, [x22]
        mov x6, #0x1
        mov x7, xzr
        mov x8, xzr
        mov x9, #0x1
        mov x10, #0x3a
        tst x15, #0x1
curve25519_x25519_byte_innerloop:
        csel x11, x14, xzr, ne
        csel x12, x16, xzr, ne
        csel x17, x8, xzr, ne
        csel x19, x9, xzr, ne
        ccmp x13, x14, #0x2, ne
        sub x11, x13, x11
        sub x12, x15, x12
        csel x14, x14, x13, cs
        cneg x11, x11, cc
        csel x16, x16, x15, cs
        cneg x15, x12, cc
        csel x8, x8, x6, cs
        csel x9, x9, x7, cs
        tst x12, #0x2
        add x6, x6, x17
        add x7, x7, x19
        lsr x13, x11, #1
        lsr x15, x15, #1
        add x8, x8, x8
        add x9, x9, x9
        sub x10, x10, #0x1
        cbnz x10, curve25519_x25519_byte_innerloop
        mov x13, xzr
        mov x14, xzr
        mov x17, xzr
        mov x19, xzr
        mov x10, xzr
curve25519_x25519_byte_congloop:
        ldr x11, [x4, x10, lsl #3]
        ldr x12, [x1, x10, lsl #3]
        mul x15, x6, x11
        mul x16, x7, x12
        adds x15, x15, x13
        umulh x13, x6, x11
        adc x13, x13, xzr
        adds x15, x15, x16
        extr x17, x15, x17, #58
        str x17, [x4, x10, lsl #3]
        mov x17, x15
        umulh x15, x7, x12
        adc x13, x13, x15
        mul x15, x8, x11
        mul x16, x9, x12
        adds x15, x15, x14
        umulh x14, x8, x11
        adc x14, x14, xzr
        adds x15, x15, x16
        extr x19, x15, x19, #58
        str x19, [x1, x10, lsl #3]
        mov x19, x15
        umulh x15, x9, x12
        adc x14, x14, x15
        add x10, x10, #0x1
        cmp x10, x0
        b.cc curve25519_x25519_byte_congloop
        extr x13, x13, x17, #58
        extr x14, x14, x19, #58
        ldr x11, [x4]
        mul x17, x11, x20
        ldr x12, [x3]
        mul x15, x17, x12
        umulh x16, x17, x12
        adds x11, x11, x15
        mov x10, #0x1
        sub x11, x0, #0x1
        cbz x11, curve25519_x25519_byte_wmontend
curve25519_x25519_byte_wmontloop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x4, x10, lsl #3]
        mul x15, x17, x11
        adcs x12, x12, x16
        umulh x16, x17, x11
        adc x16, x16, xzr
        adds x12, x12, x15
        sub x15, x10, #0x1
        str x12, [x4, x15, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_wmontloop
curve25519_x25519_byte_wmontend:
        adcs x16, x16, x13
        adc x13, xzr, xzr
        sub x15, x10, #0x1
        str x16, [x4, x15, lsl #3]
        negs x10, xzr
curve25519_x25519_byte_wcmploop:
        ldr x11, [x4, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        sbcs xzr, x11, x12
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_wcmploop
        sbcs xzr, x13, xzr
        csetm x13, cs
        negs x10, xzr
curve25519_x25519_byte_wcorrloop:
        ldr x11, [x4, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        and x12, x12, x13
        sbcs x11, x11, x12
        str x11, [x4, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_wcorrloop
        ldr x11, [x1]
        mul x17, x11, x20
        ldr x12, [x3]
        mul x15, x17, x12
        umulh x16, x17, x12
        adds x11, x11, x15
        mov x10, #0x1
        sub x11, x0, #0x1
        cbz x11, curve25519_x25519_byte_zmontend
curve25519_x25519_byte_zmontloop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x1, x10, lsl #3]
        mul x15, x17, x11
        adcs x12, x12, x16
        umulh x16, x17, x11
        adc x16, x16, xzr
        adds x12, x12, x15
        sub x15, x10, #0x1
        str x12, [x1, x15, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_zmontloop
curve25519_x25519_byte_zmontend:
        adcs x16, x16, x14
        adc x14, xzr, xzr
        sub x15, x10, #0x1
        str x16, [x1, x15, lsl #3]
        negs x10, xzr
curve25519_x25519_byte_zcmploop:
        ldr x11, [x1, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        sbcs xzr, x11, x12
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_zcmploop
        sbcs xzr, x14, xzr
        csetm x14, cs
        negs x10, xzr
curve25519_x25519_byte_zcorrloop:
        ldr x11, [x1, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        and x12, x12, x14
        sbcs x11, x11, x12
        str x11, [x1, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_zcorrloop
        mov x13, xzr
        mov x14, xzr
        mov x17, xzr
        mov x19, xzr
        mov x10, xzr
curve25519_x25519_byte_crossloop:
        ldr x11, [x21, x10, lsl #3]
        ldr x12, [x22, x10, lsl #3]
        mul x15, x6, x11
        mul x16, x7, x12
        adds x15, x15, x13
        umulh x13, x6, x11
        adc x13, x13, xzr
        subs x15, x15, x16
        str x15, [x21, x10, lsl #3]
        umulh x15, x7, x12
        sub x17, x15, x17
        sbcs x13, x13, x17
        csetm x17, cc
        mul x15, x8, x11
        mul x16, x9, x12
        adds x15, x15, x14
        umulh x14, x8, x11
        adc x14, x14, xzr
        subs x15, x15, x16
        str x15, [x22, x10, lsl #3]
        umulh x15, x9, x12
        sub x19, x15, x19
        sbcs x14, x14, x19
        csetm x19, cc
        add x10, x10, #0x1
        cmp x10, x5
        b.cc curve25519_x25519_byte_crossloop
        cmn x17, x17
        ldr x15, [x21]
        mov x10, xzr
        sub x6, x5, #0x1
        cbz x6, curve25519_x25519_byte_negskip1
curve25519_x25519_byte_negloop1:
        add x11, x10, #0x8
        ldr x12, [x21, x11]
        extr x15, x12, x15, #58
        eor x15, x15, x17
        adcs x15, x15, xzr
        str x15, [x21, x10]
        mov x15, x12
        add x10, x10, #0x8
        sub x6, x6, #0x1
        cbnz x6, curve25519_x25519_byte_negloop1
curve25519_x25519_byte_negskip1:
        extr x15, x13, x15, #58
        eor x15, x15, x17
        adcs x15, x15, xzr
        str x15, [x21, x10]
        cmn x19, x19
        ldr x15, [x22]
        mov x10, xzr
        sub x6, x5, #0x1
        cbz x6, curve25519_x25519_byte_negskip2
curve25519_x25519_byte_negloop2:
        add x11, x10, #0x8
        ldr x12, [x22, x11]
        extr x15, x12, x15, #58
        eor x15, x15, x19
        adcs x15, x15, xzr
        str x15, [x22, x10]
        mov x15, x12
        add x10, x10, #0x8
        sub x6, x6, #0x1
        cbnz x6, curve25519_x25519_byte_negloop2
curve25519_x25519_byte_negskip2:
        extr x15, x14, x15, #58
        eor x15, x15, x19
        adcs x15, x15, xzr
        str x15, [x22, x10]
        mov x10, xzr
        cmn x17, x17
curve25519_x25519_byte_wfliploop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x4, x10, lsl #3]
        and x11, x11, x17
        eor x12, x12, x17
        adcs x11, x11, x12
        str x11, [x4, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_wfliploop
        mvn x19, x19
        mov x10, xzr
        cmn x19, x19
curve25519_x25519_byte_zfliploop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x1, x10, lsl #3]
        and x11, x11, x19
        eor x12, x12, x19
        adcs x11, x11, x12
        str x11, [x1, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_byte_zfliploop
        subs x2, x2, #0x3a
        b.hi curve25519_x25519_byte_outerloop
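
// At this point zm = 1/zn mod p_25519 whenever zn is nonzero; what remains is
// the degenerate-case fixup below, the multiplication res = xn * zm mod
// p_25519, and the little-endian byte serialization of the result.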

// Since we eventually want to return 0 when the result is the point at
// infinity, we force xn = 0 whenever zn = 0. This avoids building in a
// dependency on the behavior of modular inverse in out-of-scope cases.

        ldp x0, x1, [zn]
        ldp x2, x3, [zn+16]
        orr x0, x0, x1
        orr x2, x2, x3
        orr x4, x0, x2
        cmp x4, xzr
        ldp x0, x1, [xn]
        csel x0, x0, xzr, ne
        csel x1, x1, xzr, ne
        ldp x2, x3, [xn+16]
        stp x0, x1, [xn]
        csel x2, x2, xzr, ne
        csel x3, x3, xzr, ne
        stp x2, x3, [xn+16]

// Now the result is xn * (1/zn), fully reduced modulo p.

        mul_p25519(zn,xn,zm)

        ldp x10, x11, [zn]
        strb w10, [resx]
        lsr x10, x10, #8
        strb w10, [resx+1]
        lsr x10, x10, #8
        strb w10, [resx+2]
        lsr x10, x10, #8
        strb w10, [resx+3]
        lsr x10, x10, #8
        strb w10, [resx+4]
        lsr x10, x10, #8
        strb w10, [resx+5]
        lsr x10, x10, #8
        strb w10, [resx+6]
        lsr x10, x10, #8
        strb w10, [resx+7]
        strb w11, [resx+8]
        lsr x11, x11, #8
        strb w11, [resx+9]
        lsr x11, x11, #8
        strb w11, [resx+10]
        lsr x11, x11, #8
        strb w11, [resx+11]
        lsr x11, x11, #8
        strb w11, [resx+12]
        lsr x11, x11, #8
        strb w11, [resx+13]
        lsr x11, x11, #8
        strb w11, [resx+14]
        lsr x11, x11, #8
        strb w11, [resx+15]
        ldp x12, x13, [zn+16]
        strb w12, [resx+16]
        lsr x12, x12, #8
        strb w12, [resx+17]
        lsr x12, x12, #8
        strb w12, [resx+18]
        lsr x12, x12, #8
        strb w12, [resx+19]
        lsr x12, x12, #8
        strb w12, [resx+20]
        lsr x12, x12, #8
        strb w12, [resx+21]
        lsr x12, x12, #8
        strb w12, [resx+22]
        lsr x12, x12, #8
        strb w12, [resx+23]
        strb w13, [resx+24]
        lsr x13, x13, #8
        strb w13, [resx+25]
        lsr x13, x13, #8
        strb w13, [resx+26]
        lsr x13, x13, #8
        strb w13, [resx+27]
        lsr x13, x13, #8
        strb w13, [resx+28]
        lsr x13, x13, #8
        strb w13, [resx+29]
        lsr x13, x13, #8
        strb w13, [resx+30]
        lsr x13, x13, #8
        strb w13, [resx+31]

// Restore stack and registers

        add sp, sp, #NSPACE
        ldp x23, x24, [sp], 16
        ldp x21, x22, [sp], 16
        ldp x19, x20, [sp], 16
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif