// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// An implementation of the compress function of SHA256 using avx/avx2/avx512
// It was translated from assembly (OpenSSL) to C by
//
// Nir Drucker and Shay Gueron
// AWS Cryptographic Algorithms Group.
// (ndrucker@amazon.com, gueron@amazon.com)
//
// This file depends on vec_t and on the following macros:
// LOAD, ADD32, ALIGNR8, SRL32, SLL32, SRL64, SHUF32, SHUF8
// (the alternative AVX512 implementation also uses ROR32 and MADD32)

#define SHA256_WORD_BIT_LEN (8 * sizeof(sha256_word_t))

_INLINE_ void rotate_x(vec_t x[4])
{
  const vec_t tmp = x[0];
  x[0]            = x[1];
  x[1]            = x[2];
  x[2]            = x[3];
  x[3]            = tmp;
}

#ifndef ALTERNATIVE_AVX512_IMPL

_INLINE_ vec_t sha256_update_x_avx(vec_t                x[4],
                                   const sha256_word_t *k256_p,
                                   const vec_t          lo_mask,
                                   const vec_t          hi_mask)
{
  vec_t t[4];

  // This function receives 4 128-bit registers x[3:0]=d[15:0] and calculates:
  // s0 = sigma0(d[(i + 1) % 16])
  // s1 = sigma1(d[(i + 14) % 16])
  // d[i % 16] += s0 + s1 + d[(i + 9) % 16]
  //
  // For x[0]=d[3:0]
  //
  // This means that
  // res[0] depends on d[1] (for s0), d[14] (for s1), and d[9]
  // res[1] depends on d[2] (for s0), d[15] (for s1), and d[10]
  // res[2] depends on d[3] (for s0), res[0] (for s1), and d[11]
  // res[3] depends on d[4] (for s0), res[1] (for s1), and d[12]

  t[0] = ALIGNR8(x[1], x[0], 4);           // d[4:1]
  t[3] = ALIGNR8(x[3], x[2], 4);           // d[12:9]
  t[2] = SRL32(t[0], sigma0_0);            // d[4:1] >> s0[0]
  x[0] = ADD32(x[0], t[3]);                // d[3:0] + d[12:9]
  t[3] = SRL32(t[0], sigma0_2);            // d[4:1] >> s0[2]
  t[1] = SLL32(t[0], SHA256_WORD_BIT_LEN - sigma0_1); // d[4:1] << (32 - s0[1])
  t[0] = t[3] ^ t[2];                      // (d[4:1] >> s0[2]) ^
                                           // (d[4:1] >> s0[0])
  t[3] = SHUF32(x[3], 0xfa);               // d[15,15,14,14]
  t[2] = SRL32(t[2], sigma0_1 - sigma0_0); // d[4:1] >> s0[1]
  t[0] ^= t[1] ^ t[2];                     // ROTR(d[4:1], s0[1]) ^
                                           // (d[4:1] >> s0[2]) ^
                                           // (d[4:1] >> s0[0])
  t[1] = SLL32(t[1], sigma0_1 - sigma0_0); // d[4:1] << (32 - s0[0])
  t[2] = SRL32(t[3], sigma1_2);            // d[15,15,14,14] >> s1[2]
  t[3] = SRL64(t[3], sigma1_0);            // ROTR(d[-,15,-,14], s1[0])
  x[0] = ADD32(x[0], t[0] ^ t[1]);         // d[3:0] + d[12:9] + sigma0(d[4:1])
  t[2] ^= t[3];                            // (d[15,15,14,14] >> s1[2]) ^
                                           // ROTR(d[-,15,-,14], s1[0])
  t[3] = SRL64(t[3], sigma1_1 - sigma1_0); // ROTR(d[-,15,-,14], s1[1])
  t[2] = SHUF8(t[2] ^ t[3], lo_mask);      // sigma1(d[Zero,Zero,15,14])
  x[0] = ADD32(x[0], t[2]);                // d[3:0] + d[12:9] +
                                           // sigma0(d[4:1]) +
                                           // sigma1(d[-,-,15,14])

  // When calculating s1 = sigma1(d) for the upper dwords,
  // we use the already updated d[1:0]
  t[3] = SHUF32(x[0], 0x50);               // d[1,1,0,0]
  t[2] = SRL32(t[3], sigma1_2);            // d[1,1,0,0] >> s1[2]
  t[3] = SRL64(t[3], sigma1_0);            // ROTR(d[-,1,-,0], s1[0])
  t[2] ^= t[3];                            // ROTR(d[-,1,-,0], s1[0]) ^
                                           // (d[1,1,0,0] >> s1[2])
  t[3] = SRL64(t[3], sigma1_1 - sigma1_0); // ROTR(d[-,1,-,0], s1[1])

  // sigma1(d[x[1],x[0],Zero,Zero]) - the two sigma1 values land in the
  // upper dwords
  x[0] = ADD32(x[0], SHUF8(t[2] ^ t[3], hi_mask));

  rotate_x(x);

  return ADD32(x[3], LOAD(k256_p));
}

#else

_INLINE_ vec_t sha256_update_x_avx(vec_t                x[4],
                                   const sha256_word_t *k256_p,
                                   UNUSED const vec_t   lo_mask,
                                   UNUSED const vec_t   hi_mask)
{
  vec_t t[2];
  vec_t s0;
  vec_t s1;

  // This function receives 4 128-bit registers x[3:0]=d[15:0] and calculates:
  // s0 = sigma0(d[(i + 1) % 16])
  // s1 = sigma1(d[(i + 14) % 16])
  // d[i % 16] += s0 + s1 + d[(i + 9) % 16]
  //
  // For x[0]=d[3:0]
  //
  // This means that
  // res[0] depends on d[1] (for s0), d[14] (for s1), and d[9]
  // res[1] depends on d[2] (for s0), d[15] (for s1), and d[10]
  // res[2] depends on d[3] (for s0), res[0] (for s1), and d[11]
  // res[3] depends on d[4] (for s0), res[1] (for s1), and d[12]
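
  // For reference, a scalar sketch of the schedule step computed below,
  // assuming the usual FIPS 180-4 constants sigma0_* = {7, 18, 3} and
  // sigma1_* = {17, 19, 10}; an illustration only, not part of the build:
  //
  //   s0   = ROTR(w[i - 15], 7) ^ ROTR(w[i - 15], 18) ^ (w[i - 15] >> 3);
  //   s1   = ROTR(w[i - 2], 17) ^ ROTR(w[i - 2], 19) ^ (w[i - 2] >> 10);
  //   w[i] = w[i - 16] + s0 + w[i - 7] + s1;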

  t[0] = ALIGNR8(x[1], x[0], 4); // d[4:1]
  t[1] = ALIGNR8(x[3], x[2], 4); // d[12:9]
  x[0] = ADD32(x[0], t[1]);      // d[3:0] + d[12:9]

  // sigma0(d[4:1]) = ROTR(d, s0[0]) ^ ROTR(d, s0[1]) ^ (d >> s0[2])
  s0   = ROR32(t[0], sigma0_0) ^ ROR32(t[0], sigma0_1) ^ SRL32(t[0], sigma0_2);
  x[0] = ADD32(x[0], s0);        // d[3:0] + d[12:9] + sigma0(d[4:1])

  t[1] = SHUF32(x[3], 0xfe);     // d[-,-,15,14]
  s1   = ROR32(t[1], sigma1_0) ^ ROR32(t[1], sigma1_1) ^ SRL32(t[1], sigma1_2);

  // Masked add: sigma1(d[15,14]) is added only into the two lower dwords
  x[0] = MADD32(x[0], LOW32X2_MASK, x[0], s1);

  // For the upper dwords, use the already updated d[1:0]
  t[1] = SHUF32(x[0], 0x40);     // d[1,0,0,0] - upper two dwords hold d[1],d[0]
  s1   = ROR32(t[1], sigma1_0) ^ ROR32(t[1], sigma1_1) ^ SRL32(t[1], sigma1_2);

  // Masked add: sigma1(d[1,0]) is added only into the two upper dwords
  x[0] = MADD32(x[0], HIGH32X2_MASK, x[0], s1);

  rotate_x(x);

  return ADD32(x[3], LOAD(k256_p));
}

#endif
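
// For illustration only: a minimal sketch of how the macros used above could
// be instantiated for the 128-bit path, assuming SSE2/SSSE3 (plus AVX512VL
// for the alternative implementation). The project's real definitions live in
// a separate header and may differ; the LOW32X2_MASK/HIGH32X2_MASK values
// below are assumptions.
//
//   #include <immintrin.h>
//   #include <stdint.h>
//   typedef __m128i  vec_t;
//   typedef uint32_t sha256_word_t;
//   #define LOAD(p)          _mm_loadu_si128((const __m128i *)(p))
//   #define ADD32(a, b)      _mm_add_epi32(a, b)
//   #define ALIGNR8(a, b, n) _mm_alignr_epi8(a, b, n)
//   #define SRL32(a, n)      _mm_srli_epi32(a, n)
//   #define SLL32(a, n)      _mm_slli_epi32(a, n)
//   #define SRL64(a, n)      _mm_srli_epi64(a, n)
//   #define SHUF32(a, imm)   _mm_shuffle_epi32(a, imm)
//   #define SHUF8(a, mask)   _mm_shuffle_epi8(a, mask)
//
//   // AVX512VL variants used by the alternative path:
//   #define ROR32(a, n)          _mm_ror_epi32(a, n)
//   #define MADD32(src, k, a, b) _mm_mask_add_epi32(src, k, a, b)
//   #define LOW32X2_MASK  0x3 // dwords 0-1
//   #define HIGH32X2_MASK 0xc // dwords 2-3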