/*
* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
* its licensors.
*
* For complete copyright and license terms please see the LICENSE at the root of this
* distribution (the "License"). All use of this software is governed by the License,
* or, if provided, by the license below or the license accompanying this file. Do not
* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
*/
// Original file Copyright Crytek GMBH or its affiliates, used under license.

// Description : unified vector math lib

#ifndef __D_VMATH_NEON__
#define __D_VMATH_NEON__

#include <arm_neon.h>

typedef float32x4_t vec4;

#include "VMath_Prototypes.hpp"

#define AVOID_COMPILER_BUG

#ifdef AVOID_COMPILER_BUG
// Packed 2-bit-per-lane encoding: x in bits 0-1, y in bits 2-3, z in bits 4-5, w in bits 6-7
#define SWIZZLEMASK5(X, Y, Z, W) ((X) | ((Y) << 2) | ((Z) << 4) | ((W) << 6))
#define SWIZZLEMASK4(N, X, Y, Z)     \
    N##x = SWIZZLEMASK5(X, Y, Z, 0), \
    N##y = SWIZZLEMASK5(X, Y, Z, 1), \
    N##z = SWIZZLEMASK5(X, Y, Z, 2), \
    N##w = SWIZZLEMASK5(X, Y, Z, 3),
#define SWIZZLEMASK3(N, X, Y)   \
    SWIZZLEMASK4(N##x, X, Y, 0) \
    SWIZZLEMASK4(N##y, X, Y, 1) \
    SWIZZLEMASK4(N##z, X, Y, 2) \
    SWIZZLEMASK4(N##w, X, Y, 3)
#define SWIZZLEMASK2(N, X)   \
    SWIZZLEMASK3(N##x, X, 0) \
    SWIZZLEMASK3(N##y, X, 1) \
    SWIZZLEMASK3(N##z, X, 2) \
    SWIZZLEMASK3(N##w, X, 3)
#define SWIZZLEMASK1   \
    SWIZZLEMASK2(x, 0) \
    SWIZZLEMASK2(y, 1) \
    SWIZZLEMASK2(z, 2) \
    SWIZZLEMASK2(w, 3)
enum ESwizzleMask
{
    SWIZZLEMASK1
};
#else
// Byte-level lookup tables for vtbl4_u8: each mask holds 16 byte indices into
// the 32-byte table {V0, V1}; V1's lanes therefore start at byte offset 16
#define SWIZZLEMASK5(X, Y, Z, W) {(X + 0), (X + 1), (X + 2), (X + 3), (Y + 0), (Y + 1), (Y + 2), (Y + 3), (Z + 16), (Z + 17), (Z + 18), (Z + 19), (W + 16), (W + 17), (W + 18), (W + 19)}
#define SWIZZLEMASK4(N, X, Y, Z) \
    SWIZZLEMASK5(X, Y, Z, 0),    \
    SWIZZLEMASK5(X, Y, Z, 4),    \
    SWIZZLEMASK5(X, Y, Z, 8),    \
    SWIZZLEMASK5(X, Y, Z, 12),
#define SWIZZLEMASK3(N, X, Y)   \
    SWIZZLEMASK4(N##x, X, Y, 0) \
    SWIZZLEMASK4(N##y, X, Y, 4) \
    SWIZZLEMASK4(N##z, X, Y, 8) \
    SWIZZLEMASK4(N##w, X, Y, 12)
#define SWIZZLEMASK2(N, X)   \
    SWIZZLEMASK3(N##x, X, 0) \
    SWIZZLEMASK3(N##y, X, 4) \
    SWIZZLEMASK3(N##z, X, 8) \
    SWIZZLEMASK3(N##w, X, 12)
#define SWIZZLEMASK1   \
    SWIZZLEMASK2(x, 0) \
    SWIZZLEMASK2(y, 4) \
    SWIZZLEMASK2(z, 8) \
    SWIZZLEMASK2(w, 12)

// The per-level weights (64/16/4/1) make each enum value the linear index of
// its mask in g_SwizzleMasks below
#define SWIZZLEINDEX5(X, Y, Z, W) (X + Y + Z + W)
#define SWIZZLEINDEX4(N, X, Y, Z)     \
    N##x = SWIZZLEINDEX5(X, Y, Z, 0), \
    N##y = SWIZZLEINDEX5(X, Y, Z, 1), \
    N##z = SWIZZLEINDEX5(X, Y, Z, 2), \
    N##w = SWIZZLEINDEX5(X, Y, Z, 3),
#define SWIZZLEINDEX3(N, X, Y)   \
    SWIZZLEINDEX4(N##x, X, Y, 0) \
    SWIZZLEINDEX4(N##y, X, Y, 4) \
    SWIZZLEINDEX4(N##z, X, Y, 8) \
    SWIZZLEINDEX4(N##w, X, Y, 12)
#define SWIZZLEINDEX2(N, X)   \
    SWIZZLEINDEX3(N##x, X, 0) \
    SWIZZLEINDEX3(N##y, X, 16) \
    SWIZZLEINDEX3(N##z, X, 32) \
    SWIZZLEINDEX3(N##w, X, 48)
#define SWIZZLEINDEX1   \
    SWIZZLEINDEX2(x, 0) \
    SWIZZLEINDEX2(y, 64) \
    SWIZZLEINDEX2(z, 128) \
    SWIZZLEINDEX2(w, 192)
enum ESwizzleMask
{
    SWIZZLEINDEX1
};

namespace cryengine_vmath_neon_internal
{
    // Contains pre-calculated lookup indices for Shuffle(). Indexed by ESwizzleMask.
    static const uint8x16_t g_SwizzleMasks[] = { SWIZZLEMASK1 };
} // namespace cryengine_vmath_neon_internal
#endif // AVOID_COMPILER_BUG
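// Compile-time sanity-check sketch of the packed mask layout (illustrative;
// assumes a C++11 compiler for static_assert). Each lane selector occupies two
// bits, x in bits 0-1 through w in bits 6-7, so e.g. zwxy selects V0.z, V0.w,
// V1.x, V1.y.
#if defined(AVOID_COMPILER_BUG) && (__cplusplus >= 201103L)
static_assert(xyzw == (0 | (1 << 2) | (2 << 4) | (3 << 6)), "identity swizzle mask");
static_assert(zwxy == (2 | (3 << 2) | (0 << 4) | (1 << 6)), "packed 2-bit lane encoding");
#endif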
enum ECacheLvl
{
    ECL_LVL1,
    ECL_LVL2,
    ECL_LVL3,
};

/**
* Bit masks for use with the result from SignMask()
*/
static const uint32 BitX = 0x00000080;
static const uint32 BitY = 0x00008000;
static const uint32 BitZ = 0x00800000;
static const uint32 BitW = 0x80000000;

ILINE vec4 Vec4(float x, float y, float z, float w)
{
    const vec4 Ret = {x, y, z, w};
    return Ret;
}

ILINE vec4 Vec4(uint32 x, uint32 y, uint32 z, uint32 w)
{
    const uint32x4_t Ret = {x, y, z, w};
    return (float32x4_t)Ret;
}

ILINE vec4 Vec4(float x)
{
    return Vec4(x, x, x, x);
}

template <int Idx>
ILINE int32 Vec4int32(vec4 V)
{
    return vgetq_lane_s32((int32x4_t)V, Idx);
}

template <int Idx>
ILINE float Vec4float(vec4 V)
{
    return vgetq_lane_f32(V, Idx);
}

ILINE int32 Vec4int32(vec4 V, uint32 Idx)
{
    switch (Idx)
    {
    case 0:
        return vgetq_lane_s32((int32x4_t)V, 0);
    case 1:
        return vgetq_lane_s32((int32x4_t)V, 1);
    case 2:
        return vgetq_lane_s32((int32x4_t)V, 2);
    case 3:
        return vgetq_lane_s32((int32x4_t)V, 3);
    default:
        assert(0);
        return 0;
    }
    //return V.m_s[Idx];
}

ILINE float Vec4float(vec4 V, uint32 Idx)
{
    switch (Idx)
    {
    case 0:
        return vgetq_lane_f32(V, 0);
    case 1:
        return vgetq_lane_f32(V, 1);
    case 2:
        return vgetq_lane_f32(V, 2);
    case 3:
        return vgetq_lane_f32(V, 3);
    default:
        assert(0);
        return 0;
    }
}

ILINE vec4 Vec4Zero()
{
    static const float32x4_t V = {0.f, 0.f, 0.f, 0.f};
    return V;
}

ILINE vec4 Vec4One()
{
    static const float32x4_t V = {1.f, 1.f, 1.f, 1.f};
    return V;
}

ILINE vec4 Vec4Four()
{
    static const float32x4_t V = {4.f, 4.f, 4.f, 4.f};
    return V;
}

ILINE vec4 Vec4ZeroOneTwoThree()
{
    static const float32x4_t V = {0.f, 1.f, 2.f, 3.f};
    return V;
}

ILINE vec4 Vec4FFFFFFFF()
{
    return Vec4(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
}

ILINE vec4 Vec4Epsilon()
{
    static const float32x4_t V = {FLT_EPSILON, FLT_EPSILON, FLT_EPSILON, FLT_EPSILON};
    return V;
}

/**
* Creates a vector where each word element is set to the value of the specified element.
* INDEX 0 is x, 1 is y, etc.
* @returns {V[INDEX], V[INDEX], V[INDEX], V[INDEX]}
*/
template <int INDEX>
ILINE vec4 Splat(vec4 V)
{
    switch (INDEX)
    {
    case 0:
        return vdupq_lane_f32(vget_low_f32(V), 0);
    case 1:
        return vdupq_lane_f32(vget_low_f32(V), 1);
    case 2: // expanded to fix a clang compiler error with the preprocessor
        return vdupq_lane_f32(vget_high_f32(V), 0);
    case 3: // expanded to fix a clang compiler error with the preprocessor
        return vdupq_lane_f32(vget_high_f32(V), 1);
    default:
        assert(0);
        return Vec4FFFFFFFF();
    }
}

template <ECacheLvl L>
ILINE void Prefetch(const void* pData)
{
    // L (the target cache level) is accepted for interface compatibility;
    // __builtin_prefetch is used with its default locality
    __builtin_prefetch(pData);
}
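// Usage sketch (illustrative; 'v' is a hypothetical local): constructing a
// vector, broadcasting a lane, and reading a lane back.
//
//   vec4 v = Vec4(1.f, 2.f, 3.f, 4.f);
//   vec4 zzzz = Splat<2>(v);   // {3.f, 3.f, 3.f, 3.f}
//   float y = Vec4float<1>(v); // 2.f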
/**
* Returns a vector containing words from V0 and V1 based on mask M.
* The first two elements of M specify word indices in V0 and the last two
* elements specify word indices in V1.
*
* e.g.
* Shuffle( M={0,1,2,3} ) => {V0[0], V0[1], V1[2], V1[3]}
*/
template <ESwizzleMask M>
ILINE vec4 Shuffle(vec4 V0, vec4 V1)
{
#ifdef AVOID_COMPILER_BUG
    return Vec4(Vec4float(V0, M & 3), Vec4float(V0, (M >> 2) & 3), Vec4float(V1, (M >> 4) & 3), Vec4float(V1, (M >> 6) & 3));
#else
    //return Vec4(V0.m_u[M&3],V0.m_u[(M>>2)&3],V1.m_u[(M>>4)&3],V1.m_u[(M>>6)&3]);
    assert(M >= 0 && M < sizeof(cryengine_vmath_neon_internal::g_SwizzleMasks) / sizeof(cryengine_vmath_neon_internal::g_SwizzleMasks[0]));

    // Get the precomputed vector containing lookup indices for the swizzle
    const uint8x16_t mask = cryengine_vmath_neon_internal::g_SwizzleMasks[M];

    // Split the lookup indices into an xy pair and a zw pair
    const uint8x8_t low_index = vget_low_u8(mask);
    const uint8x8_t high_index = vget_high_u8(mask);

    // Combine inputs into a table; V0=xyzw, V1=XYZW => xyzwXYZW
    const uint8x8x4_t tbl =
    {
        {
            (uint8x8_t)vget_low_f32(V0),
            (uint8x8_t)vget_high_f32(V0),
            (uint8x8_t)vget_low_f32(V1),
            (uint8x8_t)vget_high_f32(V1),
        }
    };

    // Look up the xy values, then the zw values, then combine them
    const uint8x8_t low = vtbl4_u8(tbl, low_index);
    const uint8x8_t high = vtbl4_u8(tbl, high_index);
    const uint8x16_t ret = vcombine_u8(low, high);
    return (float32x4_t)ret;
#endif // AVOID_COMPILER_BUG
}

template <ESwizzleMask M>
ILINE vec4 Swizzle(vec4 V)
{
    return Shuffle<M>(V, V);
}

/**
* @returns V0 + V1
*/
ILINE vec4 Add(vec4 V0, vec4 V1)
{
    return vaddq_f32(V0, V1);
}

/**
* @returns V0 - V1
*/
ILINE vec4 Sub(vec4 V0, vec4 V1)
{
    return vsubq_f32(V0, V1);
}

/**
* @returns V0 * V1
*/
ILINE vec4 Mul(vec4 V0, vec4 V1)
{
    return vmulq_f32(V0, V1);
}

/**
* @returns V0 / V1
*/
ILINE vec4 Div(vec4 V0, vec4 V1)
{
    return Mul(V0, Rcp(V1));
}

/**
* @returns a fast, low-precision estimate of 1 / V
*/
ILINE vec4 RcpFAST(vec4 V)
{
    return vrecpeq_f32(V);
}

/**
* @returns a fast, low-precision estimate of V0 / V1
*/
ILINE vec4 DivFAST(vec4 V0, vec4 V1)
{
    return Mul(V0, RcpFAST(V1));
}

/**
* @returns 1 / V
*/
ILINE vec4 Rcp(vec4 V)
{
    // Improve the estimate's precision with a single Newton-Raphson iteration
    vec4 vEstimate = RcpFAST(V);
    return vmulq_f32(vrecpsq_f32(V, vEstimate), vEstimate);
}

/**
* @returns V0 * V1 + V2
*/
ILINE vec4 Madd(vec4 V0, vec4 V1, vec4 V2)
{
    // Madd returns V0*V1+V2, but vmla(a, b, c) computes a + b*c
    return vmlaq_f32(V2, V0, V1);
}

/**
* @returns V0 * V1 - V2
*/
ILINE vec4 Msub(vec4 V0, vec4 V1, vec4 V2)
{
    // Msub returns V0*V1-V2, but vmls(a, b, c) computes a - b*c
    return Sub(Mul(V0, V1), V2);
}

ILINE vec4 Min(vec4 V0, vec4 V1)
{
    return vminq_f32(V0, V1);
}

ILINE vec4 Max(vec4 V0, vec4 V1)
{
    return vmaxq_f32(V0, V1);
}

ILINE vec4 floatToint32(vec4 V)
{
    return (float32x4_t)vcvtq_s32_f32(V);
}

ILINE vec4 int32Tofloat(vec4 V)
{
    return vcvtq_f32_s32((int32x4_t)V);
}

ILINE vec4 CmpLE(vec4 V0, vec4 V1)
{
    return (float32x4_t)vcleq_f32(V0, V1);
}
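// Note on Rcp() above: vrecpsq_f32(V, e) computes 2 - V*e, so multiplying by
// the estimate performs one Newton-Raphson step, e' = e * (2 - V*e), which
// roughly doubles the precision of the initial vrecpeq_f32 estimate. A sketch
// of a hypothetical higher-precision variant (not declared in
// VMath_Prototypes.hpp) would simply iterate the step:
//
//   ILINE vec4 RcpPrecise(vec4 V)
//   {
//       vec4 e = RcpFAST(V);
//       e = vmulq_f32(vrecpsq_f32(V, e), e); // first Newton-Raphson step
//       e = vmulq_f32(vrecpsq_f32(V, e), e); // second step, near full float precision
//       return e;
//   }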
/**
* Returns the sign bit of each word as a single word. Each is accessible via
* the Bit[X|Y|Z|W] masks.
*/
ILINE uint32 SignMask(vec4 V)
{
    // Xcode 6.3 removed support for the non-standard 'vtbl1q_p8' in favour
    // of the standard 'vtbl2_u8', which is not available in Xcode 6.2
#if defined(__AARCH64_SIMD__) && defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
    const uint8x16_t tbl = vcombine_u8((uint8x8_t)vget_low_f32(V), (uint8x8_t)vget_high_f32(V));
    const uint8x8_t idx_sign = {3, 7, 11, 15}; // MSB -> LSB = W -> X (0xWWZZYYXX)
    const uint8x8_t sign_bytes = vtbl1q_p8(tbl, idx_sign);
#elif 1
    // Create a table from which we select the bytes holding each word's MSB
    // (sign bit) and combine them into a single word
    const uint8x8x2_t tbl =
    {
        {
            (uint8x8_t)vget_low_f32(V),
            (uint8x8_t)vget_high_f32(V),
        }
    };
    const uint8x8_t idx_sign = {3, 7, 11, 15}; // MSB -> LSB = W -> X (0xWWZZYYXX)
    const uint8x8_t sign_bytes = vtbl2_u8(tbl, idx_sign);
#else
    const int32x4_t rev = vshrq_n_s32((int32x4_t)V, 24);
    const int16x4_t shorts = vmovn_s32(rev);
    const uint8x8_t tbl = (uint8x8_t)shorts;
    const uint8x8_t index = {1, 3, 5, 7};
    const uint8x8_t sign_bytes = vtbl1_u8(tbl, index);
#endif
    return vget_lane_u32((uint32x2_t)sign_bytes, 0);
    //return (V.m_Xu>>31)|((V.m_Yu>>31)<<1)|((V.m_Zu>>31)<<2)|((V.m_Wu>>31)<<3);
}

/**
* @returns V0 & V1
*/
ILINE vec4 And(vec4 V0, vec4 V1)
{
    return (float32x4_t)vandq_u32((uint32x4_t)V0, (uint32x4_t)V1);
}

/**
* @returns ~V0 & V1
*/
ILINE vec4 AndNot(vec4 V0, vec4 V1)
{
    return (float32x4_t)vandq_u32(vmvnq_u32((uint32x4_t)V0), (uint32x4_t)V1);
}

/**
* @returns V0 | V1
*/
ILINE vec4 Or(vec4 V0, vec4 V1)
{
    return (float32x4_t)vorrq_u32((uint32x4_t)V0, (uint32x4_t)V1);
}

/**
* @returns V0 ^ V1
*/
ILINE vec4 Xor(vec4 V0, vec4 V1)
{
    return (float32x4_t)veorq_u32((uint32x4_t)V0, (uint32x4_t)V1);
}

ILINE vec4 Select(vec4 V0, vec4 V1, vec4 M)
{
    return SelectSign(V0, V1, M);
}

/**
* Returns a new vector where each word element comes from V0 if the sign of
* the corresponding word in M is 0 (+); otherwise the word element comes from
* V1.
*
* e.g.
* SelectSign( {0.f,1.f,2.f,3.f}, {a.f,b.f,c.f,d.f}, {0.f,0.f,-1.f,-1.f} )
*   => {0.f, 1.f, c.f, d.f}
*/
ILINE vec4 SelectSign(vec4 V0, vec4 V1, vec4 M)
{
    const int32x4_t mask = vshrq_n_s32((int32x4_t)M, 31);
    return vbslq_f32((uint32x4_t)mask, V1, V0);
    //M = ShiftAR(M,31);
    //return Or(AndNot(M,V0),And(M,V1));
}

/**
* Expands each byte of VIn to a float.
*
* e.g.
* ExtractByteToFloat( {0x00010203, 0x04...} ) => {3.f, 2.f, 1.f, 0.f} ...
*/
static ILINE void ExtractByteToFloat(vec4& rVout0, vec4& rVout1, vec4& rVout2, vec4& rVout3, vec4 VIn)
{
    // 16 bytes => 2x 8 shorts
    const int8x8_t XY8 = vget_low_s8((int8x16_t)VIn);
    const int8x8_t ZW8 = vget_high_s8((int8x16_t)VIn);
    const int16x8_t XY16 = vmovl_s8(XY8);
    const int16x8_t ZW16 = vmovl_s8(ZW8);

    // 2x 8 shorts => 4x 4 ints
    const int16x4_t X16 = vget_low_s16(XY16);
    const int16x4_t Y16 = vget_high_s16(XY16);
    const int16x4_t Z16 = vget_low_s16(ZW16);
    const int16x4_t W16 = vget_high_s16(ZW16);
    const int32x4_t X32 = vmovl_s16(X16);
    const int32x4_t Y32 = vmovl_s16(Y16);
    const int32x4_t Z32 = vmovl_s16(Z16);
    const int32x4_t W32 = vmovl_s16(W16);

    // 4x 4 ints => 4x 4 floats
    rVout0 = vcvtq_f32_s32(X32);
    rVout1 = vcvtq_f32_s32(Y32);
    rVout2 = vcvtq_f32_s32(Z32);
    rVout3 = vcvtq_f32_s32(W32);
}
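// Usage sketch (illustrative; 'v' is a hypothetical local): combining the
// comparison helpers with SignMask() and the Bit[X|Y|Z|W] masks defined above
// to branch on per-lane results.
//
//   const uint32 mask = SignMask(CmpLE(v, Vec4Zero()));
//   if (mask & BitX) { /* v.x <= 0.f */ }
//   if ((mask & (BitX | BitY)) == (BitX | BitY)) { /* v.x and v.y both <= 0.f */ }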
/**
* Returns a vector where each bit is from V0 if the corresponding bit in
* M is 0; otherwise the bit is from V1 (i.e. the vsel instruction from PowerPC,
* or selb from SPU).
*
* e.g.
* SelectBits(b0001, b1110, b0011) => b0010
*/
ILINE vec4 SelectBits(vec4 V0, vec4 V1, vec4 M)
{
    // The order of args to vbsl is the exact opposite of SelectBits()
    return vbslq_f32((uint32x4_t)M, V1, V0);
    //return Or(AndNot(M,V0),And(M,V1));
}

/**
* Returns a vector where, for each word, all bits are set to 1 if the
* corresponding words in V0 and V1 are equal.
*
* e.g.
* CmpEq( {0.f...}, {0.f, 1.f, 2.f...} ) => {0xffffffff, 0x0, 0x0, 0x0}
*/
ILINE vec4 CmpEq(vec4 V0, vec4 V1)
{
    return (float32x4_t)vceqq_f32(V0, V1);
}

template <int M>
ILINE vec4 SelectStatic(vec4 V0, vec4 V1)
{
    const vec4 mask = Vec4(M & 0x1 ? ~0x0u : 0x0u, M & 0x2 ? ~0x0u : 0x0u, M & 0x4 ? ~0x0u : 0x0u, M & 0x8 ? ~0x0u : 0x0u);
    return Select(V0, V1, mask);
}

#endif // __D_VMATH_NEON__

#undef AVOID_COMPILER_BUG
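// Usage note (illustrative; 'a' and 'b' are hypothetical locals): SelectStatic
// chooses lanes by a compile-time bit mask, bit 0 = x through bit 3 = w, where
// a set bit takes the lane from V1, e.g.
//
//   SelectStatic<0x5>(a, b) => {b.x, a.y, b.z, a.w}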