// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

// An implementation of the compress function of SHA512 using avx/avx2/avx512
// It was translated from assembly (OpenSSL) to C by
//
// Nir Drucker and Shay Gueron
// AWS Cryptographic Algorithms Group.
// (ndrucker@amazon.com, gueron@amazon.com)

// This file depends on vec_t and on the following macros:
// LOAD, ADD64, ALIGNR8, SRL64, SLL64
// (and ROR64 when ALTERNATIVE_AVX512_IMPL is defined)

#define SHA512_WORD_BIT_LEN (8 * sizeof(sha512_word_t))

_INLINE_ void rotate_x(vec_t x[8])
{
  const vec_t tmp = x[0];

  for(size_t i = 0; i < 7; i++) {
    x[i] = x[i + 1];
  }

  x[7] = tmp;
}

#ifndef ALTERNATIVE_AVX512_IMPL

_INLINE_ vec_t sha512_update_x_avx(vec_t x[8], const sha512_word_t *K512_p)
{
  vec_t t[4];

  // This function receives 8 128-bit registers X[7:0]=q[15:0] and calculates:
  // s0 = sigma0(q[(i + 1) % 16])
  // s1 = sigma1(q[(i + 14) % 16])
  // q[i % 16] += s0 + s1 + q[(i + 9) % 16]
  //
  // For X[0]=q[3:0]
  //
  // This means that
  // res[0] depends on q[1] (for s0), q[14] (for s1), and q[9]
  // res[1] depends on q[2] (for s0), q[15] (for s1), and q[10]
  // res[2] depends on q[3] (for s0), res[0] (for s1), and q[11]
  // res[3] depends on q[4] (for s0), res[1] (for s1), and q[12]

  t[0] = ALIGNR8(x[1], x[0], 8);                       // q[2:1]
  t[3] = ALIGNR8(x[5], x[4], 8);                       // q[10:9]
  t[2] = SRL64(t[0], sigma0_0);                        // q[2:1] >> s0[0]
  x[0] = ADD64(x[0], t[3]);                            // q[1:0] + q[10:9]
  t[3] = SRL64(t[0], sigma0_2);                        // q[2:1] >> s0[2]
  t[1] = SLL64(t[0], SHA512_WORD_BIT_LEN - sigma0_1);  // q[2:1] << (64 - s0[1])
  t[0] = t[3] ^ t[2];                                  // (q[2:1] >> s0[2]) ^
                                                       // (q[2:1] >> s0[0])
  t[2] = SRL64(t[2], sigma0_1 - sigma0_0);             // q[2:1] >> s0[1]
  t[0] ^= t[1];                                        // (q[2:1] >> s0[2]) ^
                                                       // (q[2:1] >> s0[0]) ^
                                                       // (q[2:1] << (64 - s0[1]))
  t[1] = SLL64(t[1], sigma0_1 - sigma0_0);             // q[2:1] << (64 - s0[0])
  t[0] ^= t[2] ^ t[1];                                 // sigma0(q[2:1])

  t[3] = SRL64(x[7], sigma1_2);                        // q[15:14] >> s1[2]
  t[2] = SLL64(x[7], SHA512_WORD_BIT_LEN - sigma1_1);  // q[15:14] << (64 - s1[1])
  x[0] = ADD64(x[0], t[0]);                            // q[1:0] + q[10:9] +
                                                       // sigma0(q[2:1])
  t[1] = SRL64(x[7], sigma1_0);                        // q[15:14] >> s1[0]
  t[3] ^= t[2];                                        // (q[15:14] >> s1[2]) ^
                                                       // (q[15:14] << (64 - s1[1]))
  t[2] = SLL64(t[2], sigma1_1 - sigma1_0);             // q[15:14] << (64 - s1[0])
  t[3] ^= t[1];                                        // (q[15:14] >> s1[2]) ^
                                                       // (q[15:14] << (64 - s1[1])) ^
                                                       // (q[15:14] >> s1[0])
  t[1] = SRL64(t[1], sigma1_1 - sigma1_0);             // q[15:14] >> s1[1]
  t[3] ^= t[2] ^ t[1];                                 // sigma1(q[15:14])

  // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1])
  x[0] = ADD64(x[0], t[3]);

  rotate_x(x);

  return ADD64(x[7], LOAD(K512_p));
}

#else

_INLINE_ vec_t sha512_update_x_avx(vec_t x[8], const sha512_word_t *k512_p)
{
  vec_t t[2];
  vec_t s0;
  vec_t s1;

  // This function receives 8 wide registers X[7:0]=q[15:0] and calculates:
  // s0 = sigma0(q[2:1])
  // s1 = sigma1(q[15:14])
  // q[1:0] += s0 + s1 + q[10:9]

  t[0] = ALIGNR8(x[1], x[0], 8); // q[2:1]
  t[1] = ALIGNR8(x[5], x[4], 8); // q[10:9]

  s0 = ROR64(t[0], sigma0_0) ^ ROR64(t[0], sigma0_1) ^ SRL64(t[0], sigma0_2);
  s1 = ROR64(x[7], sigma1_0) ^ ROR64(x[7], sigma1_1) ^ SRL64(x[7], sigma1_2);

  x[0] = ADD64(ADD64(ADD64(x[0], s1), s0), t[1]);
  rotate_x(x);

  return ADD64(x[7], LOAD(k512_p));
}

#endif
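
// ---------------------------------------------------------------------------
// A minimal sketch (an assumption, not this project's actual definitions) of
// how the macro layer above could look for the 128-bit AVX path, where vec_t
// holds two 64-bit schedule words. ALIGNR8 and the shifts map directly onto
// SSSE3/SSE2 intrinsics; ROR64 (used only by the ALTERNATIVE_AVX512_IMPL
// branch) needs the AVX512VL 64-bit rotate. Note that on AVX2 the same scheme
// would widen vec_t to __m256i, but _mm256_alignr_epi8 only shifts within
// 128-bit lanes, so a real 256-bit ALIGNR8 needs an extra cross-lane permute;
// this sketch therefore sticks to the 128-bit path.
#if 0 // illustration only; the real definitions are supplied by the build
#include <immintrin.h>
#include <stdint.h>

typedef uint64_t sha512_word_t;
typedef __m128i  vec_t;

#define _INLINE_         static inline
#define LOAD(p)          _mm_loadu_si128((const __m128i *)(p))
#define ADD64(a, b)      _mm_add_epi64((a), (b))
#define ALIGNR8(a, b, n) _mm_alignr_epi8((a), (b), (n))
#define SRL64(a, n)      _mm_srli_epi64((a), (n))
#define SLL64(a, n)      _mm_slli_epi64((a), (n))
#define ROR64(a, n)      _mm_ror_epi64((a), (n)) // requires AVX512VL

// The sigma shift amounts are the FIPS 180-4 SHA-512 constants:
enum { sigma0_0 = 1,  sigma0_1 = 8,  sigma0_2 = 7,
       sigma1_0 = 19, sigma1_1 = 61, sigma1_2 = 6 };
#endif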
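
// ---------------------------------------------------------------------------
// A scalar reference sketch of what the shift/XOR sequences above compute.
// SSE/AVX2 have no 64-bit rotate instruction, so the non-AVX512 branch
// expands each ROTR(x, n) into (x >> n) ^ (x << (64 - n)); the AVX512 branch
// keeps the rotates as single ROR64 instructions, which is the whole point of
// ALTERNATIVE_AVX512_IMPL. This standalone self-check (an illustration, not
// part of the library) verifies the five-term expansions per 64-bit lane.
#if 0 // illustration only
#include <assert.h>
#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned n)
{
  return (x >> n) | (x << (64 - n));
}

// Textbook small-sigma functions of SHA-512 (FIPS 180-4).
static inline uint64_t sigma0_ref(uint64_t x)
{
  return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7);
}

static inline uint64_t sigma1_ref(uint64_t x)
{
  return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
}

int main(void)
{
  const uint64_t x = 0x0123456789abcdefULL;

  // The terms produced by the SRL64/SLL64 sequences in the non-AVX512
  // branch, written out per 64-bit lane in the order they are XORed in.
  uint64_t s0 = (x >> 7) ^ (x >> 1) ^ (x << 56) ^ (x >> 8) ^ (x << 63);
  uint64_t s1 = (x >> 6) ^ (x << 3) ^ (x >> 19) ^ (x << 45) ^ (x >> 61);

  assert(s0 == sigma0_ref(x));
  assert(s1 == sigma1_ref(x));
  return 0;
}
#endif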