// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 // Input x[6]; output z[6] // // extern void bignum_montsqr_p384_alt // (uint64_t z[static 6], uint64_t x[static 6]); // // Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is // guaranteed in particular if x < p_384 initially (the "intended" case). // // Standard x86-64 ABI: RDI = z, RSI = x // Microsoft x64 ABI: RCX = z, RDX = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_alt) S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_alt) .text #define z %rdi #define x %rsi // Some temp registers for the last correction stage #define d %rax #define u %rdx #define v %r10 #define w %r11 // A zero register, very often #define zero %rbp #define zeroe %ebp // Add %rbx * m into a register-pair (high,low) maintaining consistent // carry-catching with carry (negated, as bitmask) and using %rax and %rdx // as temporaries #define mulpadd(carry,high,low,m) \ movq m, %rax ; \ mulq %rbx; \ subq carry, %rdx ; \ addq %rax, low ; \ adcq %rdx, high ; \ sbbq carry, carry // Initial version assuming no carry-in #define mulpadi(carry,high,low,m) \ movq m, %rax ; \ mulq %rbx; \ addq %rax, low ; \ adcq %rdx, high ; \ sbbq carry, carry // End version not catching the top carry-out #define mulpade(carry,high,low,m) \ movq m, %rax ; \ mulq %rbx; \ subq carry, %rdx ; \ addq %rax, low ; \ adcq %rdx, high // Core one-step "short" Montgomery reduction macro. Takes input in // [d5;d4;d3;d2;d1;d0] and returns result in [d6;d5;d4;d3;d2;d1], // adding to the existing [d5;d4;d3;d2;d1] and re-using d0 as a // temporary internally, as well as %rax, %rbx and %rdx. // It is OK for d6 and d0 to be the same register (they often are) // // We want to add (2^384 - 2^128 - 2^96 + 2^32 - 1) * w // where w = [d0 + (d0<<32)] mod 2^64 // // montreds(d6,d5,d4,d3,d2,d1,d0) #define montreds(d6,d5,d4,d3,d2,d1,d0) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ movq d0, %rbx ; \ shlq $32, %rbx ; \ addq d0, %rbx ; \ /* Construct [%rax;%rdx;d0;-] = (2^384 - p_384) * w */ \ /* We know the lowest word will cancel so we can re-use d0 */ \ /* and %rbx as temps. */ \ movq $0xffffffff00000001, %rax ; \ mulq %rbx; \ movq %rdx, d0 ; \ movq $0x00000000ffffffff, %rax ; \ mulq %rbx; \ addq %rax, d0 ; \ movl $0, %eax ; \ adcq %rbx, %rdx ; \ adcl %eax, %eax ; \ /* Now subtract that and add 2^384 * w */ \ subq d0, d1 ; \ sbbq %rdx, d2 ; \ sbbq %rax, d3 ; \ sbbq $0, d4 ; \ sbbq $0, d5 ; \ movq %rbx, d6 ; \ sbbq $0, d6 S2N_BN_SYMBOL(bignum_montsqr_p384_alt): #if WINDOWS_ABI pushq %rdi pushq %rsi movq %rcx, %rdi movq %rdx, %rsi #endif // Save more registers to play with pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 // Set up an initial window [%rcx;%r15;...%r9] = [34;05;03;01] // Note that we are using %rcx as the first step past the rotating window movq (x), %rbx movq 8(x), %rax mulq %rbx movq %rax, %r9 movq %rdx, %r10 movq 24(x), %rax mulq %rbx movq %rax, %r11 movq %rdx, %r12 movq 40(x), %rax mulq %rbx movq %rax, %r13 movq %rdx, %r14 movq 24(x), %rax mulq 32(x) movq %rax, %r15 movq %rdx, %rcx // Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window // (no carry-out possible) movq 16(x), %rbx mulpadi(%rbp,%r11,%r10,(x)) mulpadd(%rbp,%r12,%r11,8(x)) movq 8(x), %rbx mulpadd(%rbp,%r13,%r12,24(x)) mulpadd(%rbp,%r14,%r13,32(x)) mulpade(%rbp,%r15,%r14,40(x)) adcq $0, %rcx // Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms // We are running out of registers in our rotating window, so we start // using %rbx (and hence need care with using mulpadd after this). Thus // our result so far is in [%rbp;%rbx;%rcx;%r15;...%r9] movq 32(x), %rbx mulpadi(%rbp,%r13,%r12,(x)) movq 16(x), %rbx mulpadd(%rbp,%r14,%r13,24(x)) mulpadd(%rbp,%r15,%r14,32(x)) mulpadd(%rbp,%rcx,%r15,40(x)) xorl %ebx, %ebx movq 24(x), %rax mulq 40(x) subq %rbp, %rdx xorl %ebp, %ebp addq %rax, %rcx adcq %rdx, %rbx adcl %ebp, %ebp movq 32(x), %rax mulq 40(x) addq %rax, %rbx adcq %rdx, %rbp // Double the window as [%r8;%rbp;%rbx;%rcx;%r15;...%r9] xorl %r8d, %r8d addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq %r15, %r15 adcq %rcx, %rcx adcq %rbx, %rbx adcq %rbp, %rbp adcl %r8d, %r8d // Add the doubled window to the 00 + 11 + 22 + 33 + 44 + 55 terms // For one glorious moment the entire squaring result is all in the // register file as [%rsi;%rbp;%rbx;%rcx;%r15;...;%r8] // (since we've now finished with x we can re-use %rsi). But since // we are so close to running out of registers, we do a bit of // reshuffling and temporary storage in the output buffer. movq (x), %rax mulq %rax movq %r8, (z) movq %rax, %r8 movq 8(x), %rax movq %rbp, 8(z) addq %rdx, %r9 sbbq %rbp, %rbp mulq %rax negq %rbp adcq %rax, %r10 adcq %rdx, %r11 sbbq %rbp, %rbp movq 16(x), %rax mulq %rax negq %rbp adcq %rax, %r12 adcq %rdx, %r13 sbbq %rbp, %rbp movq 24(x), %rax mulq %rax negq %rbp adcq %rax, %r14 adcq %rdx, %r15 sbbq %rbp, %rbp movq 32(x), %rax mulq %rax negq %rbp adcq %rax, %rcx adcq %rdx, %rbx sbbq %rbp, %rbp movq 40(x), %rax mulq %rax negq %rbp adcq 8(z), %rax adcq (z), %rdx movq %rax, %rbp movq %rdx, %rsi // We need just *one* more register as a temp for the Montgomery steps. // Since we are writing to the z buffer anyway, make use of that again // to stash %rbx. movq %rbx, (z) // Montgomery reduce the %r13,...,%r8 window 6 times montreds(%r8,%r13,%r12,%r11,%r10,%r9,%r8) montreds(%r9,%r8,%r13,%r12,%r11,%r10,%r9) montreds(%r10,%r9,%r8,%r13,%r12,%r11,%r10) montreds(%r11,%r10,%r9,%r8,%r13,%r12,%r11) montreds(%r12,%r11,%r10,%r9,%r8,%r13,%r12) montreds(%r13,%r12,%r11,%r10,%r9,%r8,%r13) // Now we can safely restore %rbx before accumulating movq (z), %rbx addq %r8, %r14 adcq %r9, %r15 adcq %r10, %rcx adcq %r11, %rbx adcq %r12, %rbp adcq %r13, %rsi movl $0, %r8d adcq %r8, %r8 // We now have a pre-reduced 7-word form z = [%r8; %rsi;%rbp;%rbx;%rcx;%r15;%r14] // Next, accumulate in different registers z - p_384, or more precisely // // [%r8; %r13;%r12;%r11;%r10;%r9;%rax] = z + (2^384 - p_384) xorq %r11, %r11 xorq %r12, %r12 xorq %r13, %r13 movq $0xffffffff00000001, %rax addq %r14, %rax movl $0x00000000ffffffff, %r9d adcq %r15, %r9 movl $0x0000000000000001, %r10d adcq %rcx, %r10 adcq %rbx, %r11 adcq %rbp, %r12 adcq %rsi, %r13 adcq $0, %r8 // ~ZF <=> %r12 >= 1 <=> z + (2^384 - p_384) >= 2^384 <=> z >= p_384, which // determines whether to use the further reduced argument or the original z. cmovnzq %rax, %r14 cmovnzq %r9, %r15 cmovnzq %r10, %rcx cmovnzq %r11, %rbx cmovnzq %r12, %rbp cmovnzq %r13, %rsi // Write back the result movq %r14, (z) movq %r15, 8(z) movq %rcx, 16(z) movq %rbx, 24(z) movq %rbp, 32(z) movq %rsi, 40(z) // Restore registers and return popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx #if WINDOWS_ABI popq %rsi popq %rdi #endif ret #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif