// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// The x25519 function for curve25519
// Inputs scalar[4], point[4]; output res[4]
//
// extern void curve25519_x25519_alt
//   (uint64_t res[static 4],uint64_t scalar[static 4],uint64_t point[static 4])
//
// Given a scalar n and the X coordinate of an input point P = (X,Y) on
// curve25519 (Y can live in any extension field of characteristic 2^255-19),
// this returns the X coordinate of n * P, or 0 when n * P is the point at
// infinity. Both the n and X inputs are first slightly modified/mangled as
// specified in the relevant RFC (https://www.rfc-editor.org/rfc/rfc7748);
// in particular the lower three bits of n are set to zero.
//
// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_x25519_alt)
S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_x25519_alt)

        .text
        .balign 4

// Size of individual field elements

#define NUMSIZE 32

// Stable homes for the input result argument during the whole body
// and other variables that are only needed prior to the modular inverse.

#define res x23
#define i x20
#define swap x21

// Pointer to the result x coordinate to be written

#define resx res, #0

// Pointer-offset pairs for temporaries on stack with some aliasing.

#define scalar sp, #(0*NUMSIZE)

#define pointx sp, #(1*NUMSIZE)

#define zm sp, #(2*NUMSIZE)
#define sm sp, #(2*NUMSIZE)
#define dpro sp, #(2*NUMSIZE)

#define sn sp, #(3*NUMSIZE)

#define dm sp, #(4*NUMSIZE)

#define zn sp, #(5*NUMSIZE)
#define dn sp, #(5*NUMSIZE)
#define e sp, #(5*NUMSIZE)

#define dmsn sp, #(6*NUMSIZE)
#define p sp, #(6*NUMSIZE)

#define xm sp, #(7*NUMSIZE)
#define dnsm sp, #(7*NUMSIZE)
#define spro sp, #(7*NUMSIZE)

#define d sp, #(8*NUMSIZE)

#define xn sp, #(9*NUMSIZE)
#define s sp, #(9*NUMSIZE)

// Total size to reserve on the stack

#define NSPACE (10*NUMSIZE)

// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only
// trivially different from a pure function call to that subroutine.
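// As an informal reference for what mul_p25519 computes, here is a
// Python-level sketch (illustration only, not part of the assembled code;
// the name mul_p25519_sketch is hypothetical). The macro itself works on
// 64-bit limbs and exploits 2^255 == 19 and 2^256 == 38 (mod p_25519) to
// fold the high half of the 512-bit product back into the low half:
//
//   p_25519 = 2**255 - 19
//
//   def mul_p25519_sketch(a, b):       # a, b: 256-bit integers
//       return (a * b) % p_25519       # canonical (fully reduced) result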
#define mul_p25519(P0,P1,P2) \
        ldp x3, x4, [P1]; \
        ldp x7, x8, [P2]; \
        mul x12, x3, x7; \
        umulh x13, x3, x7; \
        mul x11, x3, x8; \
        umulh x14, x3, x8; \
        adds x13, x13, x11; \
        ldp x9, x10, [P2+16]; \
        mul x11, x3, x9; \
        umulh x15, x3, x9; \
        adcs x14, x14, x11; \
        mul x11, x3, x10; \
        umulh x16, x3, x10; \
        adcs x15, x15, x11; \
        adc x16, x16, xzr; \
        ldp x5, x6, [P1+16]; \
        mul x11, x4, x7; \
        adds x13, x13, x11; \
        mul x11, x4, x8; \
        adcs x14, x14, x11; \
        mul x11, x4, x9; \
        adcs x15, x15, x11; \
        mul x11, x4, x10; \
        adcs x16, x16, x11; \
        umulh x3, x4, x10; \
        adc x3, x3, xzr; \
        umulh x11, x4, x7; \
        adds x14, x14, x11; \
        umulh x11, x4, x8; \
        adcs x15, x15, x11; \
        umulh x11, x4, x9; \
        adcs x16, x16, x11; \
        adc x3, x3, xzr; \
        mul x11, x5, x7; \
        adds x14, x14, x11; \
        mul x11, x5, x8; \
        adcs x15, x15, x11; \
        mul x11, x5, x9; \
        adcs x16, x16, x11; \
        mul x11, x5, x10; \
        adcs x3, x3, x11; \
        umulh x4, x5, x10; \
        adc x4, x4, xzr; \
        umulh x11, x5, x7; \
        adds x15, x15, x11; \
        umulh x11, x5, x8; \
        adcs x16, x16, x11; \
        umulh x11, x5, x9; \
        adcs x3, x3, x11; \
        adc x4, x4, xzr; \
        mul x11, x6, x7; \
        adds x15, x15, x11; \
        mul x11, x6, x8; \
        adcs x16, x16, x11; \
        mul x11, x6, x9; \
        adcs x3, x3, x11; \
        mul x11, x6, x10; \
        adcs x4, x4, x11; \
        umulh x5, x6, x10; \
        adc x5, x5, xzr; \
        umulh x11, x6, x7; \
        adds x16, x16, x11; \
        umulh x11, x6, x8; \
        adcs x3, x3, x11; \
        umulh x11, x6, x9; \
        adcs x4, x4, x11; \
        adc x5, x5, xzr; \
        mov x7, #0x26; \
        mul x11, x7, x16; \
        umulh x9, x7, x16; \
        adds x12, x12, x11; \
        mul x11, x7, x3; \
        umulh x3, x7, x3; \
        adcs x13, x13, x11; \
        mul x11, x7, x4; \
        umulh x4, x7, x4; \
        adcs x14, x14, x11; \
        mul x11, x7, x5; \
        umulh x5, x7, x5; \
        adcs x15, x15, x11; \
        cset x16, cs; \
        adds x15, x15, x4; \
        adc x16, x16, x5; \
        cmn x15, x15; \
        orr x15, x15, #0x8000000000000000; \
        adc x8, x16, x16; \
        mov x7, #0x13; \
        madd x11, x7, x8, x7; \
        adds x12, x12, x11; \
        adcs x13, x13, x9; \
        adcs x14, x14, x3; \
        adcs x15, x15, xzr; \
        csel x7, x7, xzr, cc; \
        subs x12, x12, x7; \
        sbcs x13, x13, xzr; \
        sbcs x14, x14, xzr; \
        sbc x15, x15, xzr; \
        and x15, x15, #0x7fffffffffffffff; \
        stp x12, x13, [P0]; \
        stp x14, x15, [P0+16]

// A version of multiplication that only guarantees output < 2 * p_25519.
// This basically skips the +1 and final correction in quotient estimation.
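// Informal Python-level statement of the mul_4 contract (illustration only,
// not part of the assembled code; mul_4_contract is a hypothetical name):
// the result r need not be the canonical residue, only congruent to the
// product and bounded by twice the modulus.
//
//   def mul_4_contract(a, b, r):
//       p_25519 = 2**255 - 19
//       assert r % p_25519 == (a * b) % p_25519   # correct modulo p_25519
//       assert r < 2 * p_25519                    # but only loosely reduced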
#define mul_4(P0,P1,P2) \
        ldp x3, x4, [P1]; \
        ldp x7, x8, [P2]; \
        mul x12, x3, x7; \
        umulh x13, x3, x7; \
        mul x11, x3, x8; \
        umulh x14, x3, x8; \
        adds x13, x13, x11; \
        ldp x9, x10, [P2+16]; \
        mul x11, x3, x9; \
        umulh x15, x3, x9; \
        adcs x14, x14, x11; \
        mul x11, x3, x10; \
        umulh x16, x3, x10; \
        adcs x15, x15, x11; \
        adc x16, x16, xzr; \
        ldp x5, x6, [P1+16]; \
        mul x11, x4, x7; \
        adds x13, x13, x11; \
        mul x11, x4, x8; \
        adcs x14, x14, x11; \
        mul x11, x4, x9; \
        adcs x15, x15, x11; \
        mul x11, x4, x10; \
        adcs x16, x16, x11; \
        umulh x3, x4, x10; \
        adc x3, x3, xzr; \
        umulh x11, x4, x7; \
        adds x14, x14, x11; \
        umulh x11, x4, x8; \
        adcs x15, x15, x11; \
        umulh x11, x4, x9; \
        adcs x16, x16, x11; \
        adc x3, x3, xzr; \
        mul x11, x5, x7; \
        adds x14, x14, x11; \
        mul x11, x5, x8; \
        adcs x15, x15, x11; \
        mul x11, x5, x9; \
        adcs x16, x16, x11; \
        mul x11, x5, x10; \
        adcs x3, x3, x11; \
        umulh x4, x5, x10; \
        adc x4, x4, xzr; \
        umulh x11, x5, x7; \
        adds x15, x15, x11; \
        umulh x11, x5, x8; \
        adcs x16, x16, x11; \
        umulh x11, x5, x9; \
        adcs x3, x3, x11; \
        adc x4, x4, xzr; \
        mul x11, x6, x7; \
        adds x15, x15, x11; \
        mul x11, x6, x8; \
        adcs x16, x16, x11; \
        mul x11, x6, x9; \
        adcs x3, x3, x11; \
        mul x11, x6, x10; \
        adcs x4, x4, x11; \
        umulh x5, x6, x10; \
        adc x5, x5, xzr; \
        umulh x11, x6, x7; \
        adds x16, x16, x11; \
        umulh x11, x6, x8; \
        adcs x3, x3, x11; \
        umulh x11, x6, x9; \
        adcs x4, x4, x11; \
        adc x5, x5, xzr; \
        mov x7, #0x26; \
        mul x11, x7, x16; \
        umulh x9, x7, x16; \
        adds x12, x12, x11; \
        mul x11, x7, x3; \
        umulh x3, x7, x3; \
        adcs x13, x13, x11; \
        mul x11, x7, x4; \
        umulh x4, x7, x4; \
        adcs x14, x14, x11; \
        mul x11, x7, x5; \
        umulh x5, x7, x5; \
        adcs x15, x15, x11; \
        cset x16, cs; \
        adds x15, x15, x4; \
        adc x16, x16, x5; \
        cmn x15, x15; \
        bic x15, x15, #0x8000000000000000; \
        adc x8, x16, x16; \
        mov x7, #0x13; \
        mul x11, x7, x8; \
        adds x12, x12, x11; \
        adcs x13, x13, x9; \
        adcs x14, x14, x3; \
        adc x15, x15, xzr; \
        stp x12, x13, [P0]; \
        stp x14, x15, [P0+16]

// Squaring just giving a result < 2 * p_25519, which is done by
// basically skipping the +1 in the quotient estimate and the final
// optional correction.
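// The 4x4 squaring below follows the usual pattern: form each off-diagonal
// limb product once, double the accumulated cross terms, then add in the
// squares of the individual limbs. A Python-level sketch of that structure
// (illustration only, not part of the assembled code; sqr_sketch is a
// hypothetical name):
//
//   def sqr_sketch(x):                    # x = [x0, x1, x2, x3], 64-bit limbs
//       acc = 0
//       for i in range(4):
//           for j in range(i + 1, 4):
//               acc += x[i] * x[j] << (64 * (i + j))   # off-diagonal terms
//       acc *= 2                                       # each occurs twice
//       for i in range(4):
//           acc += x[i] * x[i] << (64 * 2 * i)         # diagonal squares
//       return acc      # == (x0 + x1*2**64 + x2*2**128 + x3*2**192)**2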
#define sqr_4(P0,P1) \
        ldp x2, x3, [P1]; \
        mul x9, x2, x3; \
        umulh x10, x2, x3; \
        ldp x4, x5, [P1+16]; \
        mul x11, x2, x5; \
        umulh x12, x2, x5; \
        mul x7, x2, x4; \
        umulh x6, x2, x4; \
        adds x10, x10, x7; \
        adcs x11, x11, x6; \
        mul x7, x3, x4; \
        umulh x6, x3, x4; \
        adc x6, x6, xzr; \
        adds x11, x11, x7; \
        mul x13, x4, x5; \
        umulh x14, x4, x5; \
        adcs x12, x12, x6; \
        mul x7, x3, x5; \
        umulh x6, x3, x5; \
        adc x6, x6, xzr; \
        adds x12, x12, x7; \
        adcs x13, x13, x6; \
        adc x14, x14, xzr; \
        adds x9, x9, x9; \
        adcs x10, x10, x10; \
        adcs x11, x11, x11; \
        adcs x12, x12, x12; \
        adcs x13, x13, x13; \
        adcs x14, x14, x14; \
        cset x6, cs; \
        umulh x7, x2, x2; \
        mul x8, x2, x2; \
        adds x9, x9, x7; \
        mul x7, x3, x3; \
        adcs x10, x10, x7; \
        umulh x7, x3, x3; \
        adcs x11, x11, x7; \
        mul x7, x4, x4; \
        adcs x12, x12, x7; \
        umulh x7, x4, x4; \
        adcs x13, x13, x7; \
        mul x7, x5, x5; \
        adcs x14, x14, x7; \
        umulh x7, x5, x5; \
        adc x6, x6, x7; \
        mov x3, #0x26; \
        mul x7, x3, x12; \
        umulh x4, x3, x12; \
        adds x8, x8, x7; \
        mul x7, x3, x13; \
        umulh x13, x3, x13; \
        adcs x9, x9, x7; \
        mul x7, x3, x14; \
        umulh x14, x3, x14; \
        adcs x10, x10, x7; \
        mul x7, x3, x6; \
        umulh x6, x3, x6; \
        adcs x11, x11, x7; \
        cset x12, cs; \
        adds x11, x11, x14; \
        adc x12, x12, x6; \
        cmn x11, x11; \
        bic x11, x11, #0x8000000000000000; \
        adc x2, x12, x12; \
        mov x3, #0x13; \
        mul x7, x3, x2; \
        adds x8, x8, x7; \
        adcs x9, x9, x4; \
        adcs x10, x10, x13; \
        adc x11, x11, xzr; \
        stp x8, x9, [P0]; \
        stp x10, x11, [P0+16]

// Modular addition with double modulus 2 * p_25519 = 2^256 - 38.
// This only ensures that the result fits in 4 digits, not that it is reduced
// even w.r.t. the double modulus. The result is always correct mod p_25519
// provided the sum of the inputs is < 2^256 + 2^256 - 38, so in particular
// provided at least one of them is reduced double modulo.

#define add_twice4(P0,P1,P2) \
        ldp x3, x4, [P1]; \
        ldp x7, x8, [P2]; \
        adds x3, x3, x7; \
        adcs x4, x4, x8; \
        ldp x5, x6, [P1+16]; \
        ldp x7, x8, [P2+16]; \
        adcs x5, x5, x7; \
        adcs x6, x6, x8; \
        mov x9, #38; \
        csel x9, x9, xzr, cs; \
        adds x3, x3, x9; \
        adcs x4, x4, xzr; \
        adcs x5, x5, xzr; \
        adc x6, x6, xzr; \
        stp x3, x4, [P0]; \
        stp x5, x6, [P0+16]

// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38

#define sub_twice4(p0,p1,p2) \
        ldp x5, x6, [p1]; \
        ldp x4, x3, [p2]; \
        subs x5, x5, x4; \
        sbcs x6, x6, x3; \
        ldp x7, x8, [p1+16]; \
        ldp x4, x3, [p2+16]; \
        sbcs x7, x7, x4; \
        sbcs x8, x8, x3; \
        mov x4, #38; \
        csel x3, x4, xzr, lo; \
        subs x5, x5, x3; \
        sbcs x6, x6, xzr; \
        sbcs x7, x7, xzr; \
        sbc x8, x8, xzr; \
        stp x5, x6, [p0]; \
        stp x7, x8, [p0+16]

// Combined z = c * x + y with reduction only < 2 * p_25519
// where c is initially in the X1 register. It is assumed
// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a
// high mul in the final part.
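// In this file cmadd_4 is only ever used with c = 121666 (0x1db42), i.e. to
// form e = 121666 * p + d in a ladder step. A Python-level sketch of the
// operation under the size assumption above (illustration only, not part of
// the assembled code; cmadd_sketch is a hypothetical name):
//
//   def cmadd_sketch(c, x, y):
//       r = c * x + y
//       r = (r % 2**255) + 19 * (r >> 255)   # fold using 2**255 == 19 (mod p)
//       return r                             # == c*x + y (mod p_25519), < 2*p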
#define cmadd_4(p0,p2,p3) \
        ldp x7, x8, [p2]; \
        ldp x9, x10, [p2+16]; \
        mul x3, x1, x7; \
        mul x4, x1, x8; \
        mul x5, x1, x9; \
        mul x6, x1, x10; \
        umulh x7, x1, x7; \
        umulh x8, x1, x8; \
        umulh x9, x1, x9; \
        umulh x10, x1, x10; \
        adds x4, x4, x7; \
        adcs x5, x5, x8; \
        adcs x6, x6, x9; \
        adc x10, x10, xzr; \
        ldp x7, x8, [p3]; \
        adds x3, x3, x7; \
        adcs x4, x4, x8; \
        ldp x7, x8, [p3+16]; \
        adcs x5, x5, x7; \
        adcs x6, x6, x8; \
        adc x10, x10, xzr; \
        cmn x6, x6; \
        bic x6, x6, #0x8000000000000000; \
        adc x8, x10, x10; \
        mov x9, #19; \
        mul x7, x8, x9; \
        adds x3, x3, x7; \
        adcs x4, x4, xzr; \
        adcs x5, x5, xzr; \
        adc x6, x6, xzr; \
        stp x3, x4, [p0]; \
        stp x5, x6, [p0+16]

// Multiplex: z := if NZ then x else y

#define mux_4(p0,p1,p2) \
        ldp x0, x1, [p1]; \
        ldp x2, x3, [p2]; \
        csel x0, x0, x2, ne; \
        csel x1, x1, x3, ne; \
        stp x0, x1, [p0]; \
        ldp x0, x1, [p1+16]; \
        ldp x2, x3, [p2+16]; \
        csel x0, x0, x2, ne; \
        csel x1, x1, x3, ne; \
        stp x0, x1, [p0+16]

S2N_BN_SYMBOL(curve25519_x25519_alt):

// Save regs and make room for temporaries

        stp x19, x20, [sp, -16]!
        stp x21, x22, [sp, -16]!
        stp x23, x24, [sp, -16]!
        sub sp, sp, #NSPACE

// Move the output pointer to a stable place

        mov res, x0

// Copy the inputs to the local variables with minimal mangling:
//
//  - The scalar is in principle turned into 01xxx...xxx000 but
//    in the structure below the special handling of these bits is
//    explicit in the main computation; the scalar is just copied.
//
//  - The point x coord is reduced mod 2^255 by masking off the
//    top bit. In the main loop we only need reduction < 2 * p_25519.

        ldp x10, x11, [x1]
        stp x10, x11, [scalar]
        ldp x12, x13, [x1, #16]
        stp x12, x13, [scalar+16]

        ldp x10, x11, [x2]
        stp x10, x11, [pointx]
        ldp x12, x13, [x2, #16]
        and x13, x13, #0x7fffffffffffffff
        stp x12, x13, [pointx+16]

// Initialize with explicit doubling in order to handle set bit 254.
// Set swap = 1 and (xm,zm) = (x,1) then double as (xn,zn) = 2 * (x,1).
// We use the fact that the point x coordinate is still in registers.
// Since zm = 1 we could do the doubling with an operation count of
// 2 * S + M instead of 2 * S + 2 * M, but it doesn't seem worth
// the slight complication arising from a different linear combination.

        mov swap, #1
        stp x10, x11, [xm]
        stp x12, x13, [xm+16]
        stp swap, xzr, [zm]
        stp xzr, xzr, [zm+16]

        sub_twice4(d,xm,zm)
        add_twice4(s,xm,zm)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_4(zn,p,e)

// The main loop over unmodified bits from i = 253, ..., i = 3 (inclusive).
// This is a classic Montgomery ladder, with the main coordinates only
// reduced mod 2 * p_25519, some intermediate results even more loosely.
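// For reference, one iteration of the ladder below computes, modulo p_25519,
// the combined differential addition and doubling sketched here in Python
// (illustration only, not part of the assembled code; ladderstep_sketch is a
// hypothetical name, x is the input point coordinate, and (xt,zt) is
// whichever of the two tracked points the current scalar bit selects):
//
//   def ladderstep_sketch(x, xm, zm, xn, zn, xt, zt):
//       p = 2**255 - 19
//       dm, sm = (xm - zm) % p, (xm + zm) % p
//       dn, sn = (xn - zn) % p, (xn + zn) % p
//       dmsn, dnsm = dm * sn % p, dn * sm % p
//       xm2 = (dmsn + dnsm)**2 % p                    # differential addition
//       zm2 = x * (dmsn - dnsm)**2 % p
//       d, s = (xt - zt)**2 % p, (xt + zt)**2 % p     # doubling of (xt,zt)
//       t = (s - d) % p
//       xn2, zn2 = s * d % p, t * (d + 121666 * t) % p
//       return xm2, zm2, xn2, zn2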
        mov i, #253

curve25519_x25519_alt_scalarloop:

// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn

        sub_twice4(dm,xm,zm)
        add_twice4(sn,xn,zn)
        sub_twice4(dn,xn,zn)
        add_twice4(sm,xm,zm)

// ADDING: dmsn = dm * sn
// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt)

        mul_4(dmsn,sn,dm)

        lsr x0, i, #6
        ldr x2, [sp, x0, lsl #3]        // Exploiting scalar = sp exactly
        lsr x2, x2, i
        and x2, x2, #1

        cmp swap, x2
        mov swap, x2

        mux_4(d,dm,dn)
        mux_4(s,sm,sn)

// ADDING: dnsm = sm * dn

        mul_4(dnsm,sm,dn)

// DOUBLING: d = (xt - zt)^2

        sqr_4(d,d)

// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2
// DOUBLING: s = (xt + zt)^2

        sub_twice4(dpro,dmsn,dnsm)
        sqr_4(s,s)
        add_twice4(spro,dmsn,dnsm)
        sqr_4(dpro,dpro)

// DOUBLING: p = 4 * xt * zt = s - d

        sub_twice4(p,s,d)

// ADDING: xm' = (dmsn + dnsm)^2

        sqr_4(xm,spro)

// DOUBLING: e = 121666 * p + d

        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)

// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d

        mul_4(xn,s,d)

// ADDING: zm' = x * (dmsn - dnsm)^2

        mul_4(zm,dpro,pointx)

// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt))
//               = p * (d + 121666 * p)

        mul_4(zn,p,e)

// Loop down as far as 3 (inclusive)

        sub i, i, #1
        cmp i, #3
        bcs curve25519_x25519_alt_scalarloop

// Multiplex directly into (xn,zn) then do three pure doubling steps;
// this accounts for the implicit zeroing of the three lowest bits
// of the scalar. On the very last doubling we *fully* reduce zn mod
// p_25519 to ease checking for degeneracy below.

        cmp swap, xzr
        mux_4(xn,xm,xn)
        mux_4(zn,zm,zn)

        sub_twice4(d,xn,zn)
        add_twice4(s,xn,zn)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_4(zn,p,e)

        sub_twice4(d,xn,zn)
        add_twice4(s,xn,zn)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_4(zn,p,e)

        sub_twice4(d,xn,zn)
        add_twice4(s,xn,zn)
        sqr_4(d,d)
        sqr_4(s,s)
        sub_twice4(p,s,d)
        mov x1, 0xdb42
        orr x1, x1, 0x10000
        cmadd_4(e,p,d)
        mul_4(xn,s,d)
        mul_p25519(zn,p,e)

// The projective result of the scalar multiplication is now (xn,zn).
// First set up the constant sn = 2^255 - 19 for the modular inverse.

        mov x0, #-19
        mov x1, #-1
        mov x2, #0x7fffffffffffffff
        stp x0, x1, [sn]
        stp x1, x2, [sn+16]

// Prepare to call the modular inverse function to get zm = 1/zn

        mov x0, #4
        add x1, zm
        add x2, zn
        add x3, sn
        add x4, p

// Inline copy of bignum_modinv, identical except for stripping out the
// prologue and epilogue saving and restoring registers and the initial
// test for k = 0 (which is trivially false here since k = 4). For more
// details and explanations see "arm/generic/bignum_modinv.S".
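// Conceptually the register setup above corresponds to calling bignum_modinv
// with k = 4, output zm, input zn, modulus sn = p_25519, and p as temporary
// storage (see the header of "arm/generic/bignum_modinv.S" for the exact
// contract). In Python terms the effect, when zn is nonzero mod p_25519, is
// simply (illustration only; pow with a -1 exponent needs Python 3.8+):
//
//   zm = pow(zn, -1, 2**255 - 19)    # modular inverse of zn mod p_25519
//
// The degenerate zn = 0 case is handled separately after the inverse, as
// described further below.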
        lsl x10, x0, #3
        add x21, x4, x10
        add x22, x21, x10
        mov x10, xzr
curve25519_x25519_alt_copyloop:
        ldr x11, [x2, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        str x11, [x21, x10, lsl #3]
        str x12, [x22, x10, lsl #3]
        str x12, [x4, x10, lsl #3]
        str xzr, [x1, x10, lsl #3]
        add x10, x10, #0x1
        cmp x10, x0
        b.cc curve25519_x25519_alt_copyloop
        ldr x11, [x4]
        sub x12, x11, #0x1
        str x12, [x4]
        lsl x20, x11, #2
        sub x20, x11, x20
        eor x20, x20, #0x2
        mov x12, #0x1
        madd x12, x11, x20, x12
        mul x11, x12, x12
        madd x20, x12, x20, x20
        mul x12, x11, x11
        madd x20, x11, x20, x20
        mul x11, x12, x12
        madd x20, x12, x20, x20
        madd x20, x11, x20, x20
        lsl x2, x0, #7
curve25519_x25519_alt_outerloop:
        add x10, x2, #0x3f
        lsr x5, x10, #6
        cmp x5, x0
        csel x5, x0, x5, cs
        mov x13, xzr
        mov x15, xzr
        mov x14, xzr
        mov x16, xzr
        mov x19, xzr
        mov x10, xzr
curve25519_x25519_alt_toploop:
        ldr x11, [x21, x10, lsl #3]
        ldr x12, [x22, x10, lsl #3]
        orr x17, x11, x12
        cmp x17, xzr
        and x17, x19, x13
        csel x15, x17, x15, ne
        and x17, x19, x14
        csel x16, x17, x16, ne
        csel x13, x11, x13, ne
        csel x14, x12, x14, ne
        csetm x19, ne
        add x10, x10, #0x1
        cmp x10, x5
        b.cc curve25519_x25519_alt_toploop
        orr x11, x13, x14
        clz x12, x11
        negs x17, x12
        lsl x13, x13, x12
        csel x15, x15, xzr, ne
        lsl x14, x14, x12
        csel x16, x16, xzr, ne
        lsr x15, x15, x17
        lsr x16, x16, x17
        orr x13, x13, x15
        orr x14, x14, x16
        ldr x15, [x21]
        ldr x16, [x22]
        mov x6, #0x1
        mov x7, xzr
        mov x8, xzr
        mov x9, #0x1
        mov x10, #0x3a
        tst x15, #0x1
curve25519_x25519_alt_innerloop:
        csel x11, x14, xzr, ne
        csel x12, x16, xzr, ne
        csel x17, x8, xzr, ne
        csel x19, x9, xzr, ne
        ccmp x13, x14, #0x2, ne
        sub x11, x13, x11
        sub x12, x15, x12
        csel x14, x14, x13, cs
        cneg x11, x11, cc
        csel x16, x16, x15, cs
        cneg x15, x12, cc
        csel x8, x8, x6, cs
        csel x9, x9, x7, cs
        tst x12, #0x2
        add x6, x6, x17
        add x7, x7, x19
        lsr x13, x11, #1
        lsr x15, x15, #1
        add x8, x8, x8
        add x9, x9, x9
        sub x10, x10, #0x1
        cbnz x10, curve25519_x25519_alt_innerloop
        mov x13, xzr
        mov x14, xzr
        mov x17, xzr
        mov x19, xzr
        mov x10, xzr
curve25519_x25519_alt_congloop:
        ldr x11, [x4, x10, lsl #3]
        ldr x12, [x1, x10, lsl #3]
        mul x15, x6, x11
        mul x16, x7, x12
        adds x15, x15, x13
        umulh x13, x6, x11
        adc x13, x13, xzr
        adds x15, x15, x16
        extr x17, x15, x17, #58
        str x17, [x4, x10, lsl #3]
        mov x17, x15
        umulh x15, x7, x12
        adc x13, x13, x15
        mul x15, x8, x11
        mul x16, x9, x12
        adds x15, x15, x14
        umulh x14, x8, x11
        adc x14, x14, xzr
        adds x15, x15, x16
        extr x19, x15, x19, #58
        str x19, [x1, x10, lsl #3]
        mov x19, x15
        umulh x15, x9, x12
        adc x14, x14, x15
        add x10, x10, #0x1
        cmp x10, x0
        b.cc curve25519_x25519_alt_congloop
        extr x13, x13, x17, #58
        extr x14, x14, x19, #58
        ldr x11, [x4]
        mul x17, x11, x20
        ldr x12, [x3]
        mul x15, x17, x12
        umulh x16, x17, x12
        adds x11, x11, x15
        mov x10, #0x1
        sub x11, x0, #0x1
        cbz x11, curve25519_x25519_alt_wmontend
curve25519_x25519_alt_wmontloop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x4, x10, lsl #3]
        mul x15, x17, x11
        adcs x12, x12, x16
        umulh x16, x17, x11
        adc x16, x16, xzr
        adds x12, x12, x15
        sub x15, x10, #0x1
        str x12, [x4, x15, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_wmontloop
curve25519_x25519_alt_wmontend:
        adcs x16, x16, x13
        adc x13, xzr, xzr
        sub x15, x10, #0x1
        str x16, [x4, x15, lsl #3]
        negs x10, xzr
curve25519_x25519_alt_wcmploop:
        ldr x11, [x4, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        sbcs xzr, x11, x12
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_wcmploop
        sbcs xzr, x13, xzr
        csetm x13, cs
        negs x10, xzr
curve25519_x25519_alt_wcorrloop:
        ldr x11, [x4, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        and x12, x12, x13
        sbcs x11, x11, x12
        str x11, [x4, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_wcorrloop
        ldr x11, [x1]
        mul x17, x11, x20
        ldr x12, [x3]
        mul x15, x17, x12
        umulh x16, x17, x12
        adds x11, x11, x15
        mov x10, #0x1
        sub x11, x0, #0x1
        cbz x11, curve25519_x25519_alt_zmontend
curve25519_x25519_alt_zmontloop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x1, x10, lsl #3]
        mul x15, x17, x11
        adcs x12, x12, x16
        umulh x16, x17, x11
        adc x16, x16, xzr
        adds x12, x12, x15
        sub x15, x10, #0x1
        str x12, [x1, x15, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_zmontloop
curve25519_x25519_alt_zmontend:
        adcs x16, x16, x14
        adc x14, xzr, xzr
        sub x15, x10, #0x1
        str x16, [x1, x15, lsl #3]
        negs x10, xzr
curve25519_x25519_alt_zcmploop:
        ldr x11, [x1, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        sbcs xzr, x11, x12
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_zcmploop
        sbcs xzr, x14, xzr
        csetm x14, cs
        negs x10, xzr
curve25519_x25519_alt_zcorrloop:
        ldr x11, [x1, x10, lsl #3]
        ldr x12, [x3, x10, lsl #3]
        and x12, x12, x14
        sbcs x11, x11, x12
        str x11, [x1, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_zcorrloop
        mov x13, xzr
        mov x14, xzr
        mov x17, xzr
        mov x19, xzr
        mov x10, xzr
curve25519_x25519_alt_crossloop:
        ldr x11, [x21, x10, lsl #3]
        ldr x12, [x22, x10, lsl #3]
        mul x15, x6, x11
        mul x16, x7, x12
        adds x15, x15, x13
        umulh x13, x6, x11
        adc x13, x13, xzr
        subs x15, x15, x16
        str x15, [x21, x10, lsl #3]
        umulh x15, x7, x12
        sub x17, x15, x17
        sbcs x13, x13, x17
        csetm x17, cc
        mul x15, x8, x11
        mul x16, x9, x12
        adds x15, x15, x14
        umulh x14, x8, x11
        adc x14, x14, xzr
        subs x15, x15, x16
        str x15, [x22, x10, lsl #3]
        umulh x15, x9, x12
        sub x19, x15, x19
        sbcs x14, x14, x19
        csetm x19, cc
        add x10, x10, #0x1
        cmp x10, x5
        b.cc curve25519_x25519_alt_crossloop
        cmn x17, x17
        ldr x15, [x21]
        mov x10, xzr
        sub x6, x5, #0x1
        cbz x6, curve25519_x25519_alt_negskip1
curve25519_x25519_alt_negloop1:
        add x11, x10, #0x8
        ldr x12, [x21, x11]
        extr x15, x12, x15, #58
        eor x15, x15, x17
        adcs x15, x15, xzr
        str x15, [x21, x10]
        mov x15, x12
        add x10, x10, #0x8
        sub x6, x6, #0x1
        cbnz x6, curve25519_x25519_alt_negloop1
curve25519_x25519_alt_negskip1:
        extr x15, x13, x15, #58
        eor x15, x15, x17
        adcs x15, x15, xzr
        str x15, [x21, x10]
        cmn x19, x19
        ldr x15, [x22]
        mov x10, xzr
        sub x6, x5, #0x1
        cbz x6, curve25519_x25519_alt_negskip2
curve25519_x25519_alt_negloop2:
        add x11, x10, #0x8
        ldr x12, [x22, x11]
        extr x15, x12, x15, #58
        eor x15, x15, x19
        adcs x15, x15, xzr
        str x15, [x22, x10]
        mov x15, x12
        add x10, x10, #0x8
        sub x6, x6, #0x1
        cbnz x6, curve25519_x25519_alt_negloop2
curve25519_x25519_alt_negskip2:
        extr x15, x14, x15, #58
        eor x15, x15, x19
        adcs x15, x15, xzr
        str x15, [x22, x10]
        mov x10, xzr
        cmn x17, x17
curve25519_x25519_alt_wfliploop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x4, x10, lsl #3]
        and x11, x11, x17
        eor x12, x12, x17
        adcs x11, x11, x12
        str x11, [x4, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_wfliploop
        mvn x19, x19
        mov x10, xzr
        cmn x19, x19
curve25519_x25519_alt_zfliploop:
        ldr x11, [x3, x10, lsl #3]
        ldr x12, [x1, x10, lsl #3]
        and x11, x11, x19
        eor x12, x12, x19
        adcs x11, x11, x12
        str x11, [x1, x10, lsl #3]
        add x10, x10, #0x1
        sub x11, x10, x0
        cbnz x11, curve25519_x25519_alt_zfliploop
        subs x2, x2, #0x3a
        b.hi curve25519_x25519_alt_outerloop

// Since we eventually want to return 0 when the result is the point at
// infinity, we force xn = 0 whenever zn = 0. This avoids building in a
// dependency on the behavior of modular inverse in out-of-scope cases.
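// A Python-level sketch of this final selection and conversion (illustration
// only, not part of the assembled code; zn is fully reduced mod p_25519 at
// this point, so a plain zero test is exact):
//
//   p = 2**255 - 19
//   if zn == 0: xn = 0        # map the point at infinity to 0
//   result = xn * zm % p      # zm = 1/zn mod p from the inverse above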
        ldp x0, x1, [zn]
        ldp x2, x3, [zn+16]
        orr x0, x0, x1
        orr x2, x2, x3
        orr x4, x0, x2
        cmp x4, xzr
        ldp x0, x1, [xn]
        csel x0, x0, xzr, ne
        csel x1, x1, xzr, ne
        ldp x2, x3, [xn+16]
        stp x0, x1, [xn]
        csel x2, x2, xzr, ne
        csel x3, x3, xzr, ne
        stp x2, x3, [xn+16]

// Now the result is xn * (1/zn), fully reduced modulo p.

        mul_p25519(resx,xn,zm)

// Restore stack and registers

        add sp, sp, #NSPACE
        ldp x23, x24, [sp], 16
        ldp x21, x22, [sp], 16
        ldp x19, x20, [sp], 16

        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif