// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// ----------------------------------------------------------------------------
// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced
// Input x[9]; output z[9]
//
//    extern void bignum_sqr_p521_alt (uint64_t z[static 9], uint64_t x[static 9]);
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "_internal_s2n_bignum.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_alt)
        .text

// Input arguments

#define z %rdi
#define x %rsi

// Macro for the key "multiply and add to (c,h,l)" step

#define combadd(c,h,l,numa,numb)                \
        movq    numa, %rax ;                    \
        mulq    numb ;                          \
        addq    %rax, l ;                       \
        adcq    %rdx, h ;                       \
        adcq    $0, c

// Set up initial window (c,h,l) = numa * numb

#define combaddz(c,h,l,numa,numb)               \
        movq    numa, %rax ;                    \
        mulq    numb ;                          \
        xorq    c, c ;                          \
        movq    %rax, l ;                       \
        movq    %rdx, h

// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)

#define doubladd(c,h,l,hh,ll)                   \
        addq    ll, ll ;                        \
        adcq    hh, hh ;                        \
        adcq    c, c ;                          \
        addq    ll, l ;                         \
        adcq    hh, h ;                         \
        adcq    $0, c

// Square term incorporation (c,h,l) += numa^2

#define combadd1(c,h,l,numa)                    \
        movq    numa, %rax ;                    \
        mulq    %rax ;                          \
        addq    %rax, l ;                       \
        adcq    %rdx, h ;                       \
        adcq    $0, c

// A short form where we don't expect a top carry

#define combads(h,l,numa)                       \
        movq    numa, %rax ;                    \
        mulq    %rax ;                          \
        addq    %rax, l ;                       \
        adcq    %rdx, h

// A version doubling directly before adding, for single non-square terms

#define combadd2(c,h,l,numa,numb)               \
        movq    numa, %rax ;                    \
        mulq    numb ;                          \
        addq    %rax, %rax ;                    \
        adcq    %rdx, %rdx ;                    \
        adcq    $0, c ;                         \
        addq    %rax, l ;                       \
        adcq    %rdx, h ;                       \
        adcq    $0, c

S2N_BN_SYMBOL(bignum_sqr_p521_alt):

#if WINDOWS_ABI
        pushq   %rdi
        pushq   %rsi
        movq    %rcx, %rdi
        movq    %rdx, %rsi
#endif

// Make more registers available and make temporary space on stack

        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        subq    $72, %rsp

// Start doing a conventional columnwise squaring,
// temporarily storing the lower 9 digits on the stack.
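// As an informal illustration of the columnwise scheme (a sketch only,
// not part of the assembled code): digit k of the raw square is
// accumulated, in C-like pseudocode with x[i] the i'th 64-bit input
// digit and indices restricted to 0..8, roughly as
//
//     unsigned __int128 s = 0;
//     for (i = 0; 2 * i < k; ++i)                  // cross terms, twice
//         s += 2 * ((unsigned __int128) x[i] * x[k - i]);
//     if ((k & 1) == 0)                            // single square term
//         s += (unsigned __int128) x[k / 2] * x[k / 2];
//
// doubladd and combadd2 implement the doubling of the cross terms
// x[i] * x[k-i] (i != k-i), while combadd1 and combads add the single
// square term; the three-word window (c,h,l) carries each column's
// overflow into the next two result terms.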
// Start with result term 0

        movq    (x), %rax
        mulq    %rax
        movq    %rax, (%rsp)
        movq    %rdx, %r9
        xorq    %r10, %r10

// Result term 1

        xorq    %r11, %r11
        combadd2(%r11,%r10,%r9,(x),8(x))
        movq    %r9, 8(%rsp)

// Result term 2

        xorq    %r12, %r12
        combadd1(%r12,%r11,%r10,8(x))
        combadd2(%r12,%r11,%r10,(x),16(x))
        movq    %r10, 16(%rsp)

// Result term 3

        combaddz(%r13,%rcx,%rbx,(x),24(x))
        combadd(%r13,%rcx,%rbx,8(x),16(x))
        doubladd(%r13,%r12,%r11,%rcx,%rbx)
        movq    %r11, 24(%rsp)

// Result term 4

        combaddz(%r14,%rcx,%rbx,(x),32(x))
        combadd(%r14,%rcx,%rbx,8(x),24(x))
        doubladd(%r14,%r13,%r12,%rcx,%rbx)
        combadd1(%r14,%r13,%r12,16(x))
        movq    %r12, 32(%rsp)

// Result term 5

        combaddz(%r15,%rcx,%rbx,(x),40(x))
        combadd(%r15,%rcx,%rbx,8(x),32(x))
        combadd(%r15,%rcx,%rbx,16(x),24(x))
        doubladd(%r15,%r14,%r13,%rcx,%rbx)
        movq    %r13, 40(%rsp)

// Result term 6

        combaddz(%r8,%rcx,%rbx,(x),48(x))
        combadd(%r8,%rcx,%rbx,8(x),40(x))
        combadd(%r8,%rcx,%rbx,16(x),32(x))
        doubladd(%r8,%r15,%r14,%rcx,%rbx)
        combadd1(%r8,%r15,%r14,24(x))
        movq    %r14, 48(%rsp)

// Result term 7

        combaddz(%r9,%rcx,%rbx,(x),56(x))
        combadd(%r9,%rcx,%rbx,8(x),48(x))
        combadd(%r9,%rcx,%rbx,16(x),40(x))
        combadd(%r9,%rcx,%rbx,24(x),32(x))
        doubladd(%r9,%r8,%r15,%rcx,%rbx)
        movq    %r15, 56(%rsp)

// Result term 8

        combaddz(%r10,%rcx,%rbx,(x),64(x))
        combadd(%r10,%rcx,%rbx,8(x),56(x))
        combadd(%r10,%rcx,%rbx,16(x),48(x))
        combadd(%r10,%rcx,%rbx,24(x),40(x))
        doubladd(%r10,%r9,%r8,%rcx,%rbx)
        combadd1(%r10,%r9,%r8,32(x))
        movq    %r8, 64(%rsp)

// We now stop writing back and keep remaining results in a register window.
// Continue with result term 9

        combaddz(%r11,%rcx,%rbx,8(x),64(x))
        combadd(%r11,%rcx,%rbx,16(x),56(x))
        combadd(%r11,%rcx,%rbx,24(x),48(x))
        combadd(%r11,%rcx,%rbx,32(x),40(x))
        doubladd(%r11,%r10,%r9,%rcx,%rbx)

// Result term 10

        combaddz(%r12,%rcx,%rbx,16(x),64(x))
        combadd(%r12,%rcx,%rbx,24(x),56(x))
        combadd(%r12,%rcx,%rbx,32(x),48(x))
        doubladd(%r12,%r11,%r10,%rcx,%rbx)
        combadd1(%r12,%r11,%r10,40(x))

// Result term 11

        combaddz(%r13,%rcx,%rbx,24(x),64(x))
        combadd(%r13,%rcx,%rbx,32(x),56(x))
        combadd(%r13,%rcx,%rbx,40(x),48(x))
        doubladd(%r13,%r12,%r11,%rcx,%rbx)

// Result term 12

        combaddz(%r14,%rcx,%rbx,32(x),64(x))
        combadd(%r14,%rcx,%rbx,40(x),56(x))
        doubladd(%r14,%r13,%r12,%rcx,%rbx)
        combadd1(%r14,%r13,%r12,48(x))

// Result term 13

        combaddz(%r15,%rcx,%rbx,40(x),64(x))
        combadd(%r15,%rcx,%rbx,48(x),56(x))
        doubladd(%r15,%r14,%r13,%rcx,%rbx)

// Result term 14

        xorq    %r8, %r8
        combadd1(%r8,%r15,%r14,56(x))
        combadd2(%r8,%r15,%r14,48(x),64(x))

// Result term 15

        movq    56(x), %rax
        mulq    64(x)
        addq    %rax, %rax
        adcq    %rdx, %rdx
        addq    %rax, %r15
        adcq    %rdx, %r8

// Result term 16

        movq    64(x), %rax
        imulq   %rax, %rax
        addq    %r8, %rax

// Now the upper portion is [%rax;%r15;%r14;%r13;%r12;%r11;%r10;%r9;[%rsp+64]].
// Rotate the upper portion right 9 bits since 2^512 == 2^-9 (mod p_521).
// Let the rotated result [%rdx;%r15;%r14;...;%r8] be h (high) and
// %rsp[0..7] be l (low).

        movq    64(%rsp), %r8
        movq    %r8, %rdx
        andq    $0x1FF, %rdx
        shrdq   $9, %r9, %r8
        shrdq   $9, %r10, %r9
        shrdq   $9, %r11, %r10
        shrdq   $9, %r12, %r11
        shrdq   $9, %r13, %r12
        shrdq   $9, %r14, %r13
        shrdq   $9, %r15, %r14
        shrdq   $9, %rax, %r15
        shrq    $9, %rax
        addq    %rax, %rdx

// Force carry-in then add to get s = h + l + 1,
// but actually add all 1s in the top 55 bits to get a simple carry out

        stc
        adcq    (%rsp), %r8
        adcq    8(%rsp), %r9
        adcq    16(%rsp), %r10
        adcq    24(%rsp), %r11
        adcq    32(%rsp), %r12
        adcq    40(%rsp), %r13
        adcq    48(%rsp), %r14
        adcq    56(%rsp), %r15
        adcq    $~0x1FF, %rdx

// Now CF is set <=> h + l + 1 >= 2^521 <=> h + l >= p_521,
// in which case the lower 521 bits are already right. Otherwise if
// CF is clear, we want to subtract 1. Hence subtract the complement
// of the carry flag then mask the top word, which scrubs the
// padding in either case. Write digits back as they are created.
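// As a C-level sketch of this final correction (exposition only; s
// denotes the 9-word value h + l + 1 just formed, ignoring padding):
//
//     if (CF)                  // h + l >= p_521 = 2^521 - 1
//         z = s & (2^521 - 1); // s - 2^521 = h + l - p_521
//     else                     // h + l < p_521 already
//         z = s - 1;           // undo the forced carry-in
//
// The cmc turns CF into a borrow of (1 - CF), the sbbq chain
// propagates that single-unit subtraction across all nine words, and
// the final andq $0x1FF clears the padding from the top word in
// either case.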
        cmc
        sbbq    $0, %r8
        movq    %r8, (z)
        sbbq    $0, %r9
        movq    %r9, 8(z)
        sbbq    $0, %r10
        movq    %r10, 16(z)
        sbbq    $0, %r11
        movq    %r11, 24(z)
        sbbq    $0, %r12
        movq    %r12, 32(z)
        sbbq    $0, %r13
        movq    %r13, 40(z)
        sbbq    $0, %r14
        movq    %r14, 48(z)
        sbbq    $0, %r15
        movq    %r15, 56(z)
        sbbq    $0, %rdx
        andq    $0x1FF, %rdx
        movq    %rdx, 64(z)

// Restore registers and return

        addq    $72, %rsp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx

#if WINDOWS_ABI
        popq    %rsi
        popq    %rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif