// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 // Inputs x[6], y[6]; output z[6] // // extern void bignum_montmul_p384 // (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); // // Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y // satisfy x * y <= 2^384 * p_384 (in particular this is true if we are in // the "usual" case x < p_384 and y < p_384). // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384) S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384) .text .balign 4 // --------------------------------------------------------------------------- // Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) // c,h,l,t should all be different // t,h should not overlap w,z // --------------------------------------------------------------------------- #define muldiffn(c,h,l, t, x,y, w,z) \ subs t, x, y; \ cneg t, t, cc; \ csetm c, cc; \ subs h, w, z; \ cneg h, h, cc; \ mul l, t, h; \ umulh h, t, h; \ cinv c, c, cc; \ eor l, l, c; \ eor h, h, c // --------------------------------------------------------------------------- // Core one-step "short" Montgomery reduction macro. Takes input in // [d5;d4;d3;d2;d1;d0] and returns result in [d6;d5;d4;d3;d2;d1], // adding to the existing contents of [d5;d4;d3;d2;d1]. It is fine // for d6 to be the same register as d0. // // We want to add (2^384 - 2^128 - 2^96 + 2^32 - 1) * w // where w = [d0 + (d0<<32)] mod 2^64 // --------------------------------------------------------------------------- #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ lsl t1, d0, #32; \ add d0, t1, d0; \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. */ \ lsr t1, d0, #32; \ subs t1, t1, d0; \ sbc t2, d0, xzr; \ /* Now select in t1 the field to subtract from d1 */ \ extr t1, t2, t1, #32; \ /* And now get the terms to subtract from d2 and d3 */ \ lsr t2, t2, #32; \ adds t2, t2, d0; \ adc t3, xzr, xzr; \ /* Do the subtraction of that portion */ \ subs d1, d1, t1; \ sbcs d2, d2, t2; \ sbcs d3, d3, t3; \ sbcs d4, d4, xzr; \ sbcs d5, d5, xzr; \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr #define a0 x3 #define a1 x4 #define a2 x5 #define a3 x6 #define a4 x7 #define a5 x8 #define b0 x9 #define b1 x10 #define b2 x11 #define b3 x12 #define b4 x13 #define b5 x14 #define s0 x15 #define s1 x16 #define s2 x17 #define s3 x19 #define s4 x20 #define s5 x1 #define s6 x2 #define t1 x21 #define t2 x22 #define t3 x23 #define t4 x24 S2N_BN_SYMBOL(bignum_montmul_p384): // Save some registers stp x19, x20, [sp, -16]! stp x21, x22, [sp, -16]! stp x23, x24, [sp, -16]! // Load in all words of both inputs ldp a0, a1, [x1] ldp a2, a3, [x1, #16] ldp a4, a5, [x1, #32] ldp b0, b1, [x2] ldp b2, b3, [x2, #16] ldp b4, b5, [x2, #32] // Multiply low halves with a 3x3->6 ADK multiplier as [s5;s4;s3;s2;s1;s0] mul s0, a0, b0 mul t1, a1, b1 mul t2, a2, b2 umulh t3, a0, b0 umulh t4, a1, b1 umulh s5, a2, b2 adds t3, t3, t1 adcs t4, t4, t2 adc s5, s5, xzr adds s1, t3, s0 adcs s2, t4, t3 adcs s3, s5, t4 adc s4, s5, xzr adds s2, s2, s0 adcs s3, s3, t3 adcs s4, s4, t4 adc s5, s5, xzr muldiffn(t3,t2,t1, t4, a0,a1, b1,b0) adds xzr, t3, #1 adcs s1, s1, t1 adcs s2, s2, t2 adcs s3, s3, t3 adcs s4, s4, t3 adc s5, s5, t3 muldiffn(t3,t2,t1, t4, a0,a2, b2,b0) adds xzr, t3, #1 adcs s2, s2, t1 adcs s3, s3, t2 adcs s4, s4, t3 adc s5, s5, t3 muldiffn(t3,t2,t1, t4, a1,a2, b2,b1) adds xzr, t3, #1 adcs s3, s3, t1 adcs s4, s4, t2 adc s5, s5, t3 // Perform three "short" Montgomery steps on the low product // This shifts it to an offset compatible with middle terms // Stash the result temporarily in the output buffer // We could keep this in registers by directly adding to it in the next // ADK block, but if anything that seems to be slightly slower montreds(s0,s5,s4,s3,s2,s1,s0, t1,t2,t3) montreds(s1,s0,s5,s4,s3,s2,s1, t1,t2,t3) montreds(s2,s1,s0,s5,s4,s3,s2, t1,t2,t3) stp s3, s4, [x0] stp s5, s0, [x0, #16] stp s1, s2, [x0, #32] // Multiply high halves with a 3x3->6 ADK multiplier as [s5;s4;s3;s2;s1;s0] mul s0, a3, b3 mul t1, a4, b4 mul t2, a5, b5 umulh t3, a3, b3 umulh t4, a4, b4 umulh s5, a5, b5 adds t3, t3, t1 adcs t4, t4, t2 adc s5, s5, xzr adds s1, t3, s0 adcs s2, t4, t3 adcs s3, s5, t4 adc s4, s5, xzr adds s2, s2, s0 adcs s3, s3, t3 adcs s4, s4, t4 adc s5, s5, xzr muldiffn(t3,t2,t1, t4, a3,a4, b4,b3) adds xzr, t3, #1 adcs s1, s1, t1 adcs s2, s2, t2 adcs s3, s3, t3 adcs s4, s4, t3 adc s5, s5, t3 muldiffn(t3,t2,t1, t4, a3,a5, b5,b3) adds xzr, t3, #1 adcs s2, s2, t1 adcs s3, s3, t2 adcs s4, s4, t3 adc s5, s5, t3 muldiffn(t3,t2,t1, t4, a4,a5, b5,b4) adds xzr, t3, #1 adcs s3, s3, t1 adcs s4, s4, t2 adc s5, s5, t3 // Compute sign-magnitude a0,[a5,a4,a3] = x_hi - x_lo subs a3, a3, a0 sbcs a4, a4, a1 sbcs a5, a5, a2 sbc a0, xzr, xzr adds xzr, a0, #1 eor a3, a3, a0 adcs a3, a3, xzr eor a4, a4, a0 adcs a4, a4, xzr eor a5, a5, a0 adc a5, a5, xzr // Compute sign-magnitude b5,[b2,b1,b0] = y_lo - y_hi subs b0, b0, b3 sbcs b1, b1, b4 sbcs b2, b2, b5 sbc b5, xzr, xzr adds xzr, b5, #1 eor b0, b0, b5 adcs b0, b0, xzr eor b1, b1, b5 adcs b1, b1, xzr eor b2, b2, b5 adc b2, b2, xzr // Save the correct sign for the sub-product in b5 eor b5, a0, b5 // Add the high H to the modified low term L' and re-stash 6 words, // keeping top word in s6 ldp t1, t2, [x0] adds s0, s0, t1 adcs s1, s1, t2 ldp t1, t2, [x0, #16] adcs s2, s2, t1 adcs s3, s3, t2 ldp t1, t2, [x0, #32] adcs s4, s4, t1 adcs s5, s5, t2 adc s6, xzr, xzr stp s0, s1, [x0] stp s2, s3, [x0, #16] stp s4, s5, [x0, #32] // Multiply with yet a third 3x3 ADK for the complex mid-term mul s0, a3, b0 mul t1, a4, b1 mul t2, a5, b2 umulh t3, a3, b0 umulh t4, a4, b1 umulh s5, a5, b2 adds t3, t3, t1 adcs t4, t4, t2 adc s5, s5, xzr adds s1, t3, s0 adcs s2, t4, t3 adcs s3, s5, t4 adc s4, s5, xzr adds s2, s2, s0 adcs s3, s3, t3 adcs s4, s4, t4 adc s5, s5, xzr muldiffn(t3,t2,t1, t4, a3,a4, b1,b0) adds xzr, t3, #1 adcs s1, s1, t1 adcs s2, s2, t2 adcs s3, s3, t3 adcs s4, s4, t3 adc s5, s5, t3 muldiffn(t3,t2,t1, t4, a3,a5, b2,b0) adds xzr, t3, #1 adcs s2, s2, t1 adcs s3, s3, t2 adcs s4, s4, t3 adc s5, s5, t3 muldiffn(t3,t2,t1, t4, a4,a5, b2,b1) adds xzr, t3, #1 adcs s3, s3, t1 adcs s4, s4, t2 adc s5, s5, t3 // Unstash the H + L' sum to add in twice ldp a0, a1, [x0] ldp a2, a3, [x0, #16] ldp a4, a5, [x0, #32] // Set up a sign-modified version of the mid-product in a long accumulator // as [b3;b2;b1;b0;s5;s4;s3;s2;s1;s0], adding in the H + L' term once with // zero offset as this signed value is created adds xzr, b5, #1 eor s0, s0, b5 adcs s0, s0, a0 eor s1, s1, b5 adcs s1, s1, a1 eor s2, s2, b5 adcs s2, s2, a2 eor s3, s3, b5 adcs s3, s3, a3 eor s4, s4, b5 adcs s4, s4, a4 eor s5, s5, b5 adcs s5, s5, a5 adcs b0, b5, s6 adcs b1, b5, xzr adcs b2, b5, xzr adc b3, b5, xzr // Add in the stashed H + L' term an offset of 3 words as well adds s3, s3, a0 adcs s4, s4, a1 adcs s5, s5, a2 adcs b0, b0, a3 adcs b1, b1, a4 adcs b2, b2, a5 adc b3, b3, s6 // Do three more Montgomery steps on the composed term montreds(s0,s5,s4,s3,s2,s1,s0, t1,t2,t3) montreds(s1,s0,s5,s4,s3,s2,s1, t1,t2,t3) montreds(s2,s1,s0,s5,s4,s3,s2, t1,t2,t3) adds b0, b0, s0 adcs b1, b1, s1 adcs b2, b2, s2 adc b3, b3, xzr // Because of the way we added L' in two places, we can overspill by // more than usual in Montgomery, with the result being only known to // be < 3 * p_384, not the usual < 2 * p_384. So now we do a more // elaborate final correction in the style of bignum_cmul_p384, just // a little bit simpler because we know q is small. add t2, b3, #1 lsl t1, t2, #32 subs t4, t2, t1 sbc t1, t1, xzr adds s3, s3, t4 adcs s4, s4, t1 adcs s5, s5, t2 adcs b0, b0, xzr adcs b1, b1, xzr adcs b2, b2, xzr csetm t2, cc mov t3, #0x00000000ffffffff and t3, t3, t2 adds s3, s3, t3 eor t3, t3, t2 adcs s4, s4, t3 mov t3, #0xfffffffffffffffe and t3, t3, t2 adcs s5, s5, t3 adcs b0, b0, t2 adcs b1, b1, t2 adc b2, b2, t2 // Write back the result stp s3, s4, [x0] stp s5, b0, [x0, #16] stp b1, b2, [x0, #32] // Restore registers and return ldp x23, x24, [sp], #16 ldp x21, x22, [sp], #16 ldp x19, x20, [sp], #16 ret #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif