// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 // Input x[9]; output z[9] // // extern void bignum_montsqr_p521 // (uint64_t z[static 9], uint64_t x[static 9]); // // Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the // Montgomery base is the "native size" 2^{9*64} = 2^576; since p_521 is // a Mersenne prime the basic modular squaring bignum_sqr_p521 can be // considered a Montgomery operation to base 2^521. // // Standard x86-64 ABI: RDI = z, RSI = x // Microsoft x64 ABI: RCX = z, RDX = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521) S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521) .text #define z %rdi #define x %rsi // A zero register #define zero %rbp #define zeroe %ebp // mulpadd(high,low,i) adds %rdx * x[i] to a register-pair (high,low) // maintaining consistent double-carrying with adcx and adox, // using %rax and %rcx as temporaries. #define mulpadd(high,low,I) \ mulxq I(x), %rax, %rcx ; \ adcxq %rax, low ; \ adoxq %rcx, high // mulpade(high,low,i) adds %rdx * x[i] to a register-pair (high,low) // maintaining consistent double-carrying with adcx and adox, // using %rax as a temporary, assuming high created from scratch // and that zero has value zero. #define mulpade(high,low,I) \ mulxq I(x), %rax, high ; \ adcxq %rax, low ; \ adoxq zero, high S2N_BN_SYMBOL(bignum_montsqr_p521): #if WINDOWS_ABI pushq %rdi pushq %rsi movq %rcx, %rdi movq %rdx, %rsi #endif // Save more registers to play with and make temporary space on stack pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $64, %rsp // Do a basic 8x8 squaring stashing %rsp[0..7] but keeping the // top half in the usual rotating register window %r15,...,%r8. Except // for the lack of full writeback this is the same as bignum_sqr_8_16. xorl zeroe, zeroe movq (x), %rdx mulxq 8(x), %r9, %rax movq %r9, 8(%rsp) mulxq 16(x), %r10, %rcx adcxq %rax, %r10 movq %r10, 16(%rsp) mulxq 24(x), %r11, %rax adcxq %rcx, %r11 mulxq 32(x), %r12, %rcx adcxq %rax, %r12 mulxq 40(x), %r13, %rax adcxq %rcx, %r13 mulxq 48(x), %r14, %rcx adcxq %rax, %r14 mulxq 56(x), %r15, %r8 adcxq %rcx, %r15 adcxq zero, %r8 xorl zeroe, zeroe movq 8(x), %rdx mulpadd(%r12,%r11,16) movq %r11, 24(%rsp) mulpadd(%r13,%r12,24) movq %r12, 32(%rsp) mulpadd(%r14,%r13,32) mulpadd(%r15,%r14,40) mulpadd(%r8,%r15,48) mulpade(%r9,%r8,56) movq 32(x), %rdx mulpade(%r10,%r9,40) adcxq zero, %r10 xorl zeroe, zeroe movq 16(x), %rdx mulpadd(%r14,%r13,24) movq %r13, 40(%rsp) mulpadd(%r15,%r14,32) movq %r14, 48(%rsp) mulpadd(%r8,%r15,40) mulpadd(%r9,%r8,48) mulpadd(%r10,%r9,56) movq 48(x), %rdx mulpade(%r11,%r10,32) mulpade(%r12,%r11,40) adcxq zero, %r12 xorl zeroe, zeroe movq 24(x), %rdx mulpadd(%r8,%r15,32) movq %r15, 56(%rsp) mulpadd(%r9,%r8,40) mulpadd(%r10,%r9,48) mulpadd(%r11,%r10,56) movq 56(x), %rdx mulpadd(%r12,%r11,32) mulpade(%r13,%r12,40) mulpade(%r14,%r13,48) adcxq zero, %r14 xorl zeroe, zeroe movq (x), %rdx mulxq %rdx, %rax, %rcx movq %rax, (%rsp) movq 8(%rsp), %rax adcxq %rax, %rax adoxq %rcx, %rax movq %rax, 8(%rsp) movq 16(%rsp), %rax movq 8(x), %rdx mulxq %rdx, %rdx, %rcx adcxq %rax, %rax adoxq %rdx, %rax movq %rax, 16(%rsp) movq 24(%rsp), %rax adcxq %rax, %rax adoxq %rcx, %rax movq %rax, 24(%rsp) movq 32(%rsp), %rax movq 16(x), %rdx mulxq %rdx, %rdx, %rcx adcxq %rax, %rax adoxq %rdx, %rax movq %rax, 32(%rsp) movq 40(%rsp), %rax adcxq %rax, %rax adoxq %rcx, %rax movq %rax, 40(%rsp) movq 48(%rsp), %rax movq 24(x), %rdx mulxq %rdx, %rdx, %rcx adcxq %rax, %rax adoxq %rdx, %rax movq %rax, 48(%rsp) movq 56(%rsp), %rax adcxq %rax, %rax adoxq %rcx, %rax movq %rax, 56(%rsp) movq 32(x), %rdx mulxq %rdx, %rdx, %rcx adcxq %r8, %r8 adoxq %rdx, %r8 adcxq %r9, %r9 adoxq %rcx, %r9 movq 40(x), %rdx mulxq %rdx, %rdx, %rcx adcxq %r10, %r10 adoxq %rdx, %r10 adcxq %r11, %r11 adoxq %rcx, %r11 movq 48(x), %rdx mulxq %rdx, %rdx, %rcx adcxq %r12, %r12 adoxq %rdx, %r12 adcxq %r13, %r13 adoxq %rcx, %r13 movq 56(x), %rdx mulxq %rdx, %rdx, %r15 adcxq %r14, %r14 adoxq %rdx, %r14 adcxq zero, %r15 adoxq zero, %r15 // Augment the high part with the contribution from the top little word C. // If we write the input as 2^512 * C + x then we are otherwise just doing // x^2, so we need to add to the high part 2^512 * C^2 + (2 * C) * x. // The initial doubling add of C also clears the CF and OF flags as desired. // We extend the window now to the 9-element %rbp,%r15,%r14,...,%r8. movq 64(x), %rdx movq %rdx, %rbp imulq %rbp, %rbp addq %rdx, %rdx mulpadd(%r9,%r8,0) mulpadd(%r10,%r9,8) mulpadd(%r11,%r10,16) mulpadd(%r12,%r11,24) mulpadd(%r13,%r12,32) mulpadd(%r14,%r13,40) mulpadd(%r15,%r14,48) mulxq 56(x), %rax, %rcx adcxq %rax, %r15 adoxq %rcx, %rbp adcq $0, %rbp // Rotate the upper portion right 9 bits since 2^512 == 2^-9 (mod p_521) // Let rotated result %rbp,%r15,%r14,...,%r8 be h (high) and %rsp[0..7] be l (low) movq %r8, %rax andq $0x1FF, %rax shrdq $9, %r9, %r8 shrdq $9, %r10, %r9 shrdq $9, %r11, %r10 shrdq $9, %r12, %r11 shrdq $9, %r13, %r12 shrdq $9, %r14, %r13 shrdq $9, %r15, %r14 shrdq $9, %rbp, %r15 shrq $9, %rbp addq %rax, %rbp // Force carry-in then add to get s = h + l + 1 // but actually add all 1s in the top 53 bits to get simple carry out stc adcq (%rsp), %r8 adcq 8(%rsp), %r9 adcq 16(%rsp), %r10 adcq 24(%rsp), %r11 adcq 32(%rsp), %r12 adcq 40(%rsp), %r13 adcq 48(%rsp), %r14 adcq 56(%rsp), %r15 adcq $~0x1FF, %rbp // Now CF is set <=> h + l + 1 >= 2^521 <=> h + l >= p_521, // in which case the lower 521 bits are already right. Otherwise if // CF is clear, we want to subtract 1. Hence subtract the complement // of the carry flag then mask the top word, which scrubs the // padding in either case. cmc sbbq $0, %r8 sbbq $0, %r9 sbbq $0, %r10 sbbq $0, %r11 sbbq $0, %r12 sbbq $0, %r13 sbbq $0, %r14 sbbq $0, %r15 sbbq $0, %rbp andq $0x1FF, %rbp // So far, this has been the same as a pure modular squaring. // Now finally the Montgomery ingredient, which is just a 521-bit // rotation by 9*64 - 521 = 55 bits right. Write digits back as // they are created. movq %r8, %rax shrdq $55, %r9, %r8 movq %r8, (z) shrdq $55, %r10, %r9 movq %r9, 8(z) shrdq $55, %r11, %r10 shlq $9, %rax movq %r10, 16(z) shrdq $55, %r12, %r11 movq %r11, 24(z) shrdq $55, %r13, %r12 movq %r12, 32(z) orq %rax, %rbp shrdq $55, %r14, %r13 movq %r13, 40(z) shrdq $55, %r15, %r14 movq %r14, 48(z) shrdq $55, %rbp, %r15 movq %r15, 56(z) shrq $55, %rbp movq %rbp, 64(z) // Restore registers and return addq $64, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp #if WINDOWS_ABI popq %rsi popq %rdi #endif ret #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif