// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Montgomery multiply, z := (x * y / 2^576) mod p_521
// Inputs x[9], y[9]; output z[9]
//
//    extern void bignum_montmul_p521_alt
//     (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]);
//
// Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This
// means the Montgomery base is the "native size" 2^{9*64} = 2^576; since
// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521
// can be considered a Montgomery operation to base 2^521.
//
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_alt)
        .text

#define z %rdi
#define x %rsi

// This is moved from %rdx to free it for muls

#define y %rcx

// Macro for the key "multiply and add to (c,h,l)" step

#define combadd(c,h,l,numa,numb)                \
        movq    numa, %rax ;                    \
        mulq    numb;                           \
        addq    %rax, l ;                       \
        adcq    %rdx, h ;                       \
        adcq    $0, c

// A minutely shorter form for when c = 0 initially

#define combadz(c,h,l,numa,numb)                \
        movq    numa, %rax ;                    \
        mulq    numb;                           \
        addq    %rax, l ;                       \
        adcq    %rdx, h ;                       \
        adcq    c, c

// A short form where we don't expect a top carry

#define combads(h,l,numa,numb)                  \
        movq    numa, %rax ;                    \
        mulq    numb;                           \
        addq    %rax, l ;                       \
        adcq    %rdx, h

S2N_BN_SYMBOL(bignum_montmul_p521_alt):

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
        movq    %r8, %rdx
#endif

// Make more registers available and make temporary space on stack

        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $72, %rsp

// Copy y into a safe register to start with

        movq    %rdx, y

// Start doing a conventional columnwise multiplication,
// temporarily storing the lower 9 digits to the stack.
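
// As an informal aid (a sketch, not part of the build): each comb* step
// below folds one 64x64->128 partial product into the three-word
// accumulator (c,h,l). In C-like pseudocode, with hypothetical variable
// names and assuming a 128-bit unsigned type is available:
//
//      unsigned __int128 p = (unsigned __int128) numa * numb;
//      unsigned __int128 s = (unsigned __int128) l + (uint64_t) p;
//      l = (uint64_t) s;                                 // addq %rax, l
//      s = (unsigned __int128) h + (uint64_t) (p >> 64) + (uint64_t) (s >> 64);
//      h = (uint64_t) s;                                 // adcq %rdx, h
//      c += (uint64_t) (s >> 64);                        // adcq $0, c
//
// combadz relies on c = 0 beforehand (its adcq c, c just captures the
// carry), and combads omits the top carry word entirely.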
// Start with result term 0

        movq    (x), %rax
        mulq    (y)
        movq    %rax, (%rsp)
        movq    %rdx, %r9
        xorq    %r10, %r10

// Result term 1

        xorq    %r11, %r11
        combads(%r10,%r9,(x),8(y))
        combadz(%r11,%r10,%r9,8(x),(y))
        movq    %r9, 8(%rsp)

// Result term 2

        xorq    %r12, %r12
        combadz(%r12,%r11,%r10,(x),16(y))
        combadd(%r12,%r11,%r10,8(x),8(y))
        combadd(%r12,%r11,%r10,16(x),(y))
        movq    %r10, 16(%rsp)

// Result term 3

        xorq    %r13, %r13
        combadz(%r13,%r12,%r11,(x),24(y))
        combadd(%r13,%r12,%r11,8(x),16(y))
        combadd(%r13,%r12,%r11,16(x),8(y))
        combadd(%r13,%r12,%r11,24(x),(y))
        movq    %r11, 24(%rsp)

// Result term 4

        xorq    %r14, %r14
        combadz(%r14,%r13,%r12,(x),32(y))
        combadd(%r14,%r13,%r12,8(x),24(y))
        combadd(%r14,%r13,%r12,16(x),16(y))
        combadd(%r14,%r13,%r12,24(x),8(y))
        combadd(%r14,%r13,%r12,32(x),(y))
        movq    %r12, 32(%rsp)

// Result term 5

        xorq    %r15, %r15
        combadz(%r15,%r14,%r13,(x),40(y))
        combadd(%r15,%r14,%r13,8(x),32(y))
        combadd(%r15,%r14,%r13,16(x),24(y))
        combadd(%r15,%r14,%r13,24(x),16(y))
        combadd(%r15,%r14,%r13,32(x),8(y))
        combadd(%r15,%r14,%r13,40(x),(y))
        movq    %r13, 40(%rsp)

// Result term 6

        xorq    %r8, %r8
        combadz(%r8,%r15,%r14,(x),48(y))
        combadd(%r8,%r15,%r14,8(x),40(y))
        combadd(%r8,%r15,%r14,16(x),32(y))
        combadd(%r8,%r15,%r14,24(x),24(y))
        combadd(%r8,%r15,%r14,32(x),16(y))
        combadd(%r8,%r15,%r14,40(x),8(y))
        combadd(%r8,%r15,%r14,48(x),(y))
        movq    %r14, 48(%rsp)

// Result term 7

        xorq    %r9, %r9
        combadz(%r9,%r8,%r15,(x),56(y))
        combadd(%r9,%r8,%r15,8(x),48(y))
        combadd(%r9,%r8,%r15,16(x),40(y))
        combadd(%r9,%r8,%r15,24(x),32(y))
        combadd(%r9,%r8,%r15,32(x),24(y))
        combadd(%r9,%r8,%r15,40(x),16(y))
        combadd(%r9,%r8,%r15,48(x),8(y))
        combadd(%r9,%r8,%r15,56(x),(y))
        movq    %r15, 56(%rsp)

// Result term 8

        xorq    %r10, %r10
        combadz(%r10,%r9,%r8,(x),64(y))
        combadd(%r10,%r9,%r8,8(x),56(y))
        combadd(%r10,%r9,%r8,16(x),48(y))
        combadd(%r10,%r9,%r8,24(x),40(y))
        combadd(%r10,%r9,%r8,32(x),32(y))
        combadd(%r10,%r9,%r8,40(x),24(y))
        combadd(%r10,%r9,%r8,48(x),16(y))
        combadd(%r10,%r9,%r8,56(x),8(y))
        combadd(%r10,%r9,%r8,64(x),(y))
        movq    %r8, 64(%rsp)

// At this point we suspend writing back results and collect them
// in a register window.
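
// For orientation (an informal sketch, not part of the build): once all
// 17 columns are formed, the full product t = x * y reduces via the
// Mersenne identity 2^521 == 1 (mod p_521). In pseudocode, with
// hypothetical names:
//
//      t = x * y;                          // full product, up to 1042 bits
//      h = t >> 521;                       // top part
//      l = t & ((1 << 521) - 1);           // low 521 bits
//      t == 2^521 * h + l == h + l  (mod p_521)
//
// The code below realizes this split with a 9-bit rotation of the upper
// digits, as described before the shrdq sequence.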
// Result term 9

        xorq    %r11, %r11
        combadz(%r11,%r10,%r9,8(x),64(y))
        combadd(%r11,%r10,%r9,16(x),56(y))
        combadd(%r11,%r10,%r9,24(x),48(y))
        combadd(%r11,%r10,%r9,32(x),40(y))
        combadd(%r11,%r10,%r9,40(x),32(y))
        combadd(%r11,%r10,%r9,48(x),24(y))
        combadd(%r11,%r10,%r9,56(x),16(y))
        combadd(%r11,%r10,%r9,64(x),8(y))

// Result term 10

        xorq    %r12, %r12
        combadz(%r12,%r11,%r10,16(x),64(y))
        combadd(%r12,%r11,%r10,24(x),56(y))
        combadd(%r12,%r11,%r10,32(x),48(y))
        combadd(%r12,%r11,%r10,40(x),40(y))
        combadd(%r12,%r11,%r10,48(x),32(y))
        combadd(%r12,%r11,%r10,56(x),24(y))
        combadd(%r12,%r11,%r10,64(x),16(y))

// Result term 11

        xorq    %r13, %r13
        combadz(%r13,%r12,%r11,24(x),64(y))
        combadd(%r13,%r12,%r11,32(x),56(y))
        combadd(%r13,%r12,%r11,40(x),48(y))
        combadd(%r13,%r12,%r11,48(x),40(y))
        combadd(%r13,%r12,%r11,56(x),32(y))
        combadd(%r13,%r12,%r11,64(x),24(y))

// Result term 12

        xorq    %r14, %r14
        combadz(%r14,%r13,%r12,32(x),64(y))
        combadd(%r14,%r13,%r12,40(x),56(y))
        combadd(%r14,%r13,%r12,48(x),48(y))
        combadd(%r14,%r13,%r12,56(x),40(y))
        combadd(%r14,%r13,%r12,64(x),32(y))

// Result term 13

        xorq    %r15, %r15
        combadz(%r15,%r14,%r13,40(x),64(y))
        combadd(%r15,%r14,%r13,48(x),56(y))
        combadd(%r15,%r14,%r13,56(x),48(y))
        combadd(%r15,%r14,%r13,64(x),40(y))

// Result term 14

        xorq    %r8, %r8
        combadz(%r8,%r15,%r14,48(x),64(y))
        combadd(%r8,%r15,%r14,56(x),56(y))
        combadd(%r8,%r15,%r14,64(x),48(y))

// Result term 15

        combads(%r8,%r15,56(x),64(y))
        combads(%r8,%r15,64(x),56(y))

// Result term 16

        movq    64(x), %rax
        imulq   64(y), %rax
        addq    %r8, %rax

// Now the upper portion is [%rax;%r15;%r14;%r13;%r12;%r11;%r10;%r9;[%rsp+64]].
// Rotate the upper portion right 9 bits since 2^512 == 2^-9 (mod p_521).
// Let the rotated result %rdx,%r15,%r14,...,%r8 be h (high) and
// %rsp[0..7] be l (low).

        movq    64(%rsp), %r8
        movq    %r8, %rdx
        andq    $0x1FF, %rdx
        shrdq   $9, %r9, %r8
        shrdq   $9, %r10, %r9
        shrdq   $9, %r11, %r10
        shrdq   $9, %r12, %r11
        shrdq   $9, %r13, %r12
        shrdq   $9, %r14, %r13
        shrdq   $9, %r15, %r14
        shrdq   $9, %rax, %r15
        shrq    $9, %rax
        addq    %rax, %rdx

// Force carry-in then add to get s = h + l + 1,
// but actually add all 1s in the top 55 bits to get a simple carry out

        stc
        adcq    (%rsp), %r8
        adcq    8(%rsp), %r9
        adcq    16(%rsp), %r10
        adcq    24(%rsp), %r11
        adcq    32(%rsp), %r12
        adcq    40(%rsp), %r13
        adcq    48(%rsp), %r14
        adcq    56(%rsp), %r15
        adcq    $~0x1FF, %rdx

// Now CF is set <=> h + l + 1 >= 2^521 <=> h + l >= p_521,
// in which case the lower 521 bits are already right. Otherwise, if
// CF is clear, we want to subtract 1. Hence subtract the complement
// of the carry flag then mask the top word, which scrubs the
// padding in either case.

        cmc
        sbbq    $0, %r8
        sbbq    $0, %r9
        sbbq    $0, %r10
        sbbq    $0, %r11
        sbbq    $0, %r12
        sbbq    $0, %r13
        sbbq    $0, %r14
        sbbq    $0, %r15
        sbbq    $0, %rdx
        andq    $0x1FF, %rdx

// So far, this has been the same as a pure modular multiply.
// Now finally the Montgomery ingredient, which is just a 521-bit
// rotation by 9*64 - 521 = 55 bits right. Write digits back as
// they are created.

        movq    %r8, %rax
        shrdq   $55, %r9, %r8
        movq    %r8, (z)
        shrdq   $55, %r10, %r9
        movq    %r9, 8(z)
        shrdq   $55, %r11, %r10
        shlq    $9, %rax
        movq    %r10, 16(z)
        shrdq   $55, %r12, %r11
        movq    %r11, 24(z)
        shrdq   $55, %r13, %r12
        movq    %r12, 32(z)
        orq     %rax, %rdx
        shrdq   $55, %r14, %r13
        movq    %r13, 40(z)
        shrdq   $55, %r15, %r14
        movq    %r14, 48(z)
        shrdq   $55, %rdx, %r15
        movq    %r15, 56(z)
        shrq    $55, %rdx
        movq    %rdx, 64(z)

// Restore registers and return

        addq    $72, %rsp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12

#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
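
// ----------------------------------------------------------------------------
// Informal footnote (a sketch, not part of the build): why the final step is
// a 55-bit right rotation. Since 2^521 == 1 (mod p_521), the Montgomery
// factor is 2^-576 == 2^(-576 mod 521) == 2^-55 == 2^466 (mod p_521), and
// multiplying a 521-bit residue r by 2^466 modulo 2^521 - 1 is exactly a
// rotation right by 55 bits. In pseudocode, with hypothetical names:
//
//      z = (r >> 55) | ((r & ((1 << 55) - 1)) << 466);  // r * 2^-576 mod p_521
// ----------------------------------------------------------------------------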