/*
* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
* its licensors.
*
* For complete copyright and license terms please see the LICENSE at the root of this
* distribution (the "License"). All use of this software is governed by the License,
* or, if provided, by the license below or the license accompanying this file. Do not
* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
*/
// Original file Copyright Crytek GMBH or its affiliates, used under license.

#pragma once

#include "VMath.hpp"
#include <AzCore/Debug/Profiler.h>

//#define CULL_RENDERER_REPROJ_DEBUG
#define CULL_RENDERER_MINZ

// enable this define to allow ingame debugging of the coverage buffer
#define CULLING_ENABLE_DEBUG_OVERLAY

extern  SHWOccZBuffer HWZBuffer;


#if defined(AZ_RESTRICTED_PLATFORM)
    #if defined(AZ_PLATFORM_XENIA)
        #include "Xenia/CCullRenderer_h_xenia.inl"
    #elif defined(AZ_PLATFORM_PROVO)
        #include "Provo/CCullRenderer_h_provo.inl"
    #elif defined(AZ_PLATFORM_SALEM)
        #include "Salem/CCullRenderer_h_salem.inl"
    #endif
#endif
#if defined(AZ_RESTRICTED_SECTION_IMPLEMENTED)
#undef AZ_RESTRICTED_SECTION_IMPLEMENTED
#elif defined(WIN64)
#define CULLINLINE inline
#define CULLNOINLINE inline
#else
    #define CULLINLINE ILINE
    #define CULLNOINLINE inline
#endif

namespace NAsyncCull {
    namespace Debug {
        inline void Draw2DBox(float fX, float fY, float fHeight, float fWidth, const ColorB& rColor, float fScreenHeight, float fScreenWidth, IRenderAuxGeom* pAuxRenderer)
        {
            float fPosition[4][2] = {
                { fX,                       fY },
                { fX,                       fY + fHeight },
                { fX + fWidth,  fY + fHeight },
                { fX + fWidth, fY}
            };

            // compute normalized position from absolute points
            Vec3 vPosition[4] = {
                Vec3(fPosition[0][0] / fScreenWidth, fPosition[0][1] / fScreenHeight, 0.0f),
                Vec3(fPosition[1][0] / fScreenWidth, fPosition[1][1] / fScreenHeight, 0.0f),
                Vec3(fPosition[2][0] / fScreenWidth, fPosition[2][1] / fScreenHeight, 0.0f),
                Vec3(fPosition[3][0] / fScreenWidth, fPosition[3][1] / fScreenHeight, 0.0f)
            };

            vtx_idx const anTriangleIndices[6] = {
                0, 1, 2,
                0, 2, 3
            };

            pAuxRenderer->DrawTriangles(vPosition, 4, anTriangleIndices, 6, rColor);
        }
    } // namesapce Debug
} //namespace NasyncCull

namespace NAsyncCull
{
    typedef float                                           tdZexel;
    typedef uint16                                      tdIndex;

    typedef PodArray<NVMath::vec4>& tdVertexCacheArg;
    typedef PodArray<NVMath::vec4>  tdVertexCache;

    enum
    {
        VERTEX_CACHE_COUNT = 64 * 1024
    };

    extern const NVMath::vec4 MaskNot3;

    template<uint32 SIZEX, uint32 SIZEY>
    class CCullRenderer
    {
    public:
        enum
        {
            RESOLUTION_X = SIZEX
        };
        enum
        {
            RESOLUTION_Y = SIZEY
        };
    private:

        NVMath::vec4                    m_VMaxXY        _ALIGN(16);
        static float                    m_ZBufferMainMemory[SIZEX * SIZEY]    _ALIGN(128);
        uint32                              m_SizeX4;
        _MS_ALIGN(16) float     m_Reproject[16] _ALIGN(16);
        uint32                              m_nNumWorker;
        tdZexel*                            m_ZBuffer;

        tdZexel** m_ZBufferSwap;

        DEFINE_ALIGNED_DATA(tdZexel, m_ZBufferSwapMerged[SIZEX * SIZEY], 128); // 128 byte for XMemSet128

#ifdef CULL_RENDERER_REPROJ_DEBUG
        tdZexel                             m_ZBufferOrig[SIZEX * SIZEY];
#endif

        uint32                              m_DrawCall;
        uint32                              m_PolyCount;

        template<bool WRITE, bool CULL, bool CULL_BACKFACES>
        CULLINLINE  bool            Triangle(const  NVMath::vec4& rV0,
            const  NVMath::vec4& rV1,
            const  NVMath::vec4& rV2)
        {
            using namespace NVMath;
            vec4 V0     =   rV0;
            vec4 V1     =   rV1;
            vec4 V2     =   rV2;

            const   uint32  Idx =   SignMask(Shuffle<xzzz>(Shuffle<zzzz>(V0, V1), V2)) & (BitX | BitY | BitZ);
            if (Idx == (BitX | BitY | BitZ))
            {
                return false;
            }

            bool Visible = false;

            switch (Idx)
            {
            case 0:
                break;
            case BitX:
            {
                const vec4  F0  =   Splat<2>(V0);
                const vec4  F1  =   Splat<2>(V1);
                const vec4  F2  =   Splat<2>(V2);
                const vec4  M0  =   Div(F0, Sub(F0, F2));
                const vec4  M1  =   Div(F0, Sub(F0, F1));
                const vec4  P0  =   Madd(Sub(V2, V0), M0, V0);
                const vec4  P1  =   Madd(Sub(V1, V0), M1, V0);
                Visible = Triangle2D<WRITE, CULL, true, CULL_BACKFACES>(P0, P1, V1);
                V0  =   P0;
            }
            break;
            case BitY:
            {
                const vec4  F0  =   Splat<2>(V0);
                const vec4  F1  =   Splat<2>(V1);
                const vec4  F2  =   Splat<2>(V2);
                const vec4  M0  =   Div(F1, Sub(F1, F0));
                const vec4  M1  =   Div(F1, Sub(F1, F2));
                const vec4  P0  =   Madd(Sub(V0, V1), M0, V1);
                const vec4  P1  =   Madd(Sub(V2, V1), M1, V1);
                Visible = Triangle2D<WRITE, CULL, true, CULL_BACKFACES>(P0, P1, V2);
                V1  =   P0;
            }
            break;
            case BitX | BitY:
            {
                const vec4  F0  =   Splat<2>(V0);
                const vec4  F1  =   Splat<2>(V1);
                const vec4  F2  =   Splat<2>(V2);
                const vec4  M0  =   Div(F0, Sub(F0, F2));
                const vec4  M1  =   Div(F1, Sub(F1, F2));
                V0  =   Madd(Sub(V2, V0), M0, V0);
                V1  =   Madd(Sub(V2, V1), M1, V1);
            }
            break;
            case BitZ:
            {
                const vec4  F0  =   Splat<2>(V0);
                const vec4  F1  =   Splat<2>(V1);
                const vec4  F2  =   Splat<2>(V2);
                const vec4  M0  =   Div(F2, Sub(F2, F1));
                const vec4  M1  =   Div(F2, Sub(F2, F0));
                const vec4  P0  =   Madd(Sub(V1, V2), M0, V2);
                const vec4  P1  =   Madd(Sub(V0, V2), M1, V2);
                Visible = Triangle2D<WRITE, CULL, true, CULL_BACKFACES>(V0, P0, P1);
                V2  =   P0;
            }
            break;
            case BitX | BitZ:
            {
                const vec4  F0  =   Splat<2>(V0);
                const vec4  F1  =   Splat<2>(V1);
                const vec4  F2  =   Splat<2>(V2);
                const vec4  M0  =   Div(F0, Sub(F0, F1));
                const vec4  M1  =   Div(F2, Sub(F2, F1));
                V0  =   Madd(Sub(V1, V0), M0, V0);
                V2  =   Madd(Sub(V1, V2), M1, V2);
            }
            break;
            case BitY | BitZ:
            {
                const vec4 F0   =   Splat<2>(V0);
                const vec4 F1   =   Splat<2>(V1);
                const vec4 F2   =   Splat<2>(V2);
                const vec4  M0  =   Div(F1, Sub(F1, F0));
                const vec4  M1  =   Div(F2, Sub(F2, F0));
                V1  =   Madd(Sub(V0, V1), M0, V1);
                V2  =   Madd(Sub(V0, V2), M1, V2);
            }
            break;
            case BitX | BitY | BitZ:
                break;
#if AZ_TRAIT_COMPILER_OPTIMIZE_MISSING_DEFAULT_SWITCH_CASE
            default:
                __assume(0);
#endif
            }
            return Visible | Triangle2D<WRITE, CULL, true, CULL_BACKFACES>(V0, V1, V2);
        }


        template<bool WRITE, bool CULL, bool PROJECT, bool CULL_BACKFACES>
#if AZ_TRAIT_COMPILER_PASS_4PLUS_VECTOR_PARAMETERS_BY_VALUE
        CULLINLINE  bool            Triangle2D(NVMath::vec4 rV0, NVMath::vec4 rV1, NVMath::vec4 rV2, uint32 MinX = 0, uint32 MinY = 0, uint32 MaxX = 0, uint32 MaxY = 0, NVMath::vec4  VMinMax = NVMath::Vec4Zero(), NVMath::vec4 V210 = NVMath::Vec4Zero())
#else
        CULLINLINE  bool            Triangle2D(NVMath::vec4 rV0, NVMath::vec4 rV1, NVMath::vec4 rV2, uint32 MinX = 0, uint32 MinY = 0, uint32 MaxX = 0, uint32 MaxY = 0, NVMath::vec4& VMinMax = NVMath::Vec4Zero(), NVMath::vec4& V210 = NVMath::Vec4Zero())
#endif
        {
            using namespace NVMath;

            vec4 V0, V1, V2;
            if (PROJECT)
            {
                const   vec4    WWW     =   Shuffle<xzww>(Shuffle<wwww>(rV0, rV1), rV2);
                const   vec4    iWWW    =   Rcp(WWW);
                V0  =   Mul(rV0, Splat<0>(iWWW));
                V1  =   Mul(rV1, Splat<1>(iWWW));
                V2  =   Mul(rV2, Splat<2>(iWWW));
                V210 =   Sub(Shuffle<xyxy>(V1, V2), Swizzle<xyxy>(V0));
                vec4 Det    =   Mul(V210, Swizzle<wzwz>(V210));
                Det =   Sub(Det, Splat<1>(Det));
                if (CULL_BACKFACES)
                {
                    if ((SignMask(CmpLE(Det, Vec4Epsilon())) & BitX) != 0)
                    {
                        return false;
                    }
                }

                Det = Select(Det, NVMath::Vec4(-FLT_EPSILON), CmpEq(Det, Vec4Zero()));
                V210        =   Div(V210, Swizzle<xxxx>(Det));

                vec4 VMax   =   Max(Max(V0, V1), V2);
                vec4 VMin   =   Min(Min(V0, V1), V2);
                VMax        =   Add(VMax, Vec4One());
                VMinMax =   Shuffle<xyxy>(VMin, VMax);
                VMinMax =   Max(VMinMax, Vec4Zero());
                VMinMax =   Min(VMinMax, m_VMaxXY);
                VMinMax =   floatToint32(VMinMax);
                const uint32* pMM       =   reinterpret_cast<uint32*>(&VMinMax);
                MinX    =   pMM[0];
                MinY    =   pMM[1];
                MaxX    =   pMM[2];
                MaxY    =   pMM[3];
                if (MinX >= MaxX || MinY >= MaxY)
                {
                    return false;
                }
            }
            else
            {
                V0  =   rV0;
                V1  =   rV1;
                V2  =   rV2;
            }


            MinX &= ~3;

            VMinMax =   And(VMinMax, MaskNot3);

#ifdef CULL_RENDERER_MINZ
            const vec4  VMinZ   =   Splat<2>(Min(Min(rV0, rV1), rV2));
#endif
            const vec4  V0z =   Splat<2>(rV0);
            const vec4  Z10 =   Sub(Splat<2>(rV1), V0z);
            const vec4  Z20 =   Sub(Splat<2>(rV2), V0z);

            const vec4 X20  =   Splat<0>(V210);
            const vec4 Y20  =   Splat<1>(V210);
            const vec4 X10  =   Sub(Vec4Zero(), Splat<2>(V210));
            const vec4 Y10  =   Splat<3>(V210);

            VMinMax =   Sub(int32Tofloat(VMinMax), V0);
            const vec4  dx4 =   Add(Splat<0>(VMinMax), Vec4ZeroOneTwoThree());
            const vec4  Y1x =   Mul(Y10, dx4);
            const vec4  Y2x =   Sub(Vec4Zero(), Mul(Y20, dx4));
            vec4    dy4 =   Splat<1>(VMinMax);
            const vec4  Y14 =   Mul(Y10, Vec4Four());
            const vec4  Y24 =   Sub(Vec4Zero(), Mul(Y20, Vec4Four()));
            const vec4  Y34 =   Add(Y14, Y24);
            vec4 Visible    =   Vec4FFFFFFFF();
            uint16 y = MinY;
            do
            {
                vec4    Px  =   Madd(X10, dy4, Y1x);
                vec4    Py  =   Madd(X20, dy4, Y2x);
                vec4    Pz  =   Sub(Sub(Vec4One(), Py), Px);

                vec4*   pDstZ   =   reinterpret_cast<vec4*>(&m_ZBuffer[MinX + y * (uint16)SIZEX]);
                y++;
                uint16 x = MinX;
                do
                {
                    Prefetch<ECL_LVL1>(pDstZ);
                    x               +=  4;
                    vec4    Mask        =   Or(Or(Px, Py), Pz);
                    vec4 Z,     rZ  =   *pDstZ;
#ifdef CULL_RENDERER_MINZ
                    if (!WRITE)                                         //compile time
                    {
                        Mask        =   Or(Mask, CmpLE(rZ, VMinZ));
                    }
                    else
#endif
                    {
                        Z               =   Madd(Z10, Px, Madd(Z20, Py, V0z));
                        Mask        =   Or(Mask, CmpLE(rZ, Z));
                    }
                    Px          =   Add(Px, Y14);
                    Py          =   Add(Py, Y24);
                    Pz          =   Sub(Pz, Y34);
                    if (CULL)                                           //compile time
                    {
                        Visible =   And(Visible, Mask);
                    }
                    if (WRITE)                                          //compile time
                    {
                        *pDstZ  =   SelectSign(Z, rZ, Mask);
                    }
                    pDstZ++;
                } while (x < MaxX);
                if (!WRITE && CULL && (SignMask(Visible) & (BitX | BitY | BitZ | BitW)) != (BitX | BitY | BitZ | BitW))
                {
                    return true;
                }
                dy4 = Add(dy4, Vec4One());
            } while (y < MaxY);
            return CULL && (SignMask(Visible) & (BitX | BitY | BitZ | BitW)) != (BitX | BitY | BitZ | BitW);
        }


        CULLINLINE  bool            Quad2D(const NVMath::vec4& rV0, const NVMath::vec4& rV1, const NVMath::vec4& rV3, const NVMath::vec4& rV2)
        {
            using namespace NVMath;
            const   vec4    WWW     =   Shuffle<xzxz>(Shuffle<wwww>(rV0, rV1), Shuffle<wwww>(rV2, rV3));
            const   vec4    iWWW    =   Rcp(WWW);

            vec4 V0 =   Mul(rV0, Splat<0>(iWWW));
            vec4 V1 =   Mul(rV1, Splat<1>(iWWW));
            vec4 V2 =   Mul(rV2, Splat<2>(iWWW));
            vec4 V3 =   Mul(rV3, Splat<3>(iWWW));

            vec4 V210   =   Sub(Shuffle<xyxy>(V1, V2), Swizzle<xyxy>(V0));
            vec4 V213   =   Sub(Shuffle<xyxy>(V1, V2), Swizzle<xyxy>(V3));
            vec4 Det    =   Mul(V210, Swizzle<wzwz>(V210));
            Det             =   Sub(Det, Splat<1>(Det));

            vec4 VMax   =   Max(Max(V0, V1), Max(V2, V3));
            vec4 VMin   =   Min(Min(V0, V1), Min(V2, V3));
            VMax        =   Add(VMax, Vec4One());
            //saturate to 0 - ScreenSize cause it's assigned to uin16
            VMin        =   Min(VMin, m_VMaxXY);
            VMax        =   Min(VMax, m_VMaxXY);

            vec4    VMinMax =   floatToint32(Max(Shuffle<xyxy>(VMin, VMax), Vec4Zero()));
            uint16  MinX    =   Vec4int32(VMinMax, 0);
            const uint16    MinY    =   Vec4int32(VMinMax, 1);
            const uint16    MaxX    =   Vec4int32(VMinMax, 2);
            const uint16    MaxY    =   Vec4int32(VMinMax, 3);
            if (MinX >= MaxX || MinY >= MaxY)
            {
                return false;
            }
            MinX &= ~3;

            const vec4  VMinZ   =   Splat<2>(Min(Min(rV0, rV1), Min(rV2, rV3)));
            Det     =   Rcp(Splat<0>(Det));
            V210    =   Mul(V210, Det);
            V213    =   Mul(V213, Det);
            const vec4 X20  =   Splat<0>(V210);
            const vec4 Y20  =   Splat<1>(V210);
            const vec4 X10  =   Splat<2>(V210);
            const vec4 Y10  =   Splat<3>(V210);
            const vec4 X23  =   Splat<0>(V213);
            const vec4 Y23  =   Splat<1>(V213);
            const vec4 X13  =   Splat<2>(V213);
            const vec4 Y13  =   Splat<3>(V213);

            const vec4  dx4 =   Sub(Add(NVMath::Vec4(static_cast<float>(MinX)), Vec4ZeroOneTwoThree()), Splat<0>(V0));
            const vec4  Y10x    =   Mul(Y10, dx4);
            const vec4  Y20x    =   Mul(Y20, dx4);
            const vec4  Y13x    =   Mul(Y13, dx4);
            const vec4  Y23x    =   Mul(Y23, dx4);
            vec4    dy4 =   Sub(NVMath::Vec4(static_cast<float>(MinY)), Splat<1>(V0));
            const vec4  Y104    =   Mul(Y10, Vec4Four());
            const vec4  Y204    =   Mul(Y20, Vec4Four());
            const vec4  Y134    =   Mul(Y13, Vec4Four());
            const vec4  Y234    =   Mul(Y23, Vec4Four());
            const vec4  Y304    =   Sub(Y104, Y204);
            const vec4  Y334    =   Sub(Y134, Y234);
            vec4 Visible    =   Vec4FFFFFFFF();
            uint16 y = MinY;
            do
            {
                vec4    P0x =   Sub(Y10x, Mul(X10, dy4));
                vec4    P0y =   Sub(Mul(X20, dy4), Y20x);
                vec4    P3x =   Sub(Y13x, Mul(X13, dy4));
                vec4    P3y =   Sub(Mul(X23, dy4), Y23x);
                uint16 x        =   MinX;
                vec4*   pDstZ   =   reinterpret_cast<vec4*>(&m_ZBuffer[MinX + y * (uint16)SIZEX]);
                do
                {
                    Prefetch<ECL_LVL1>(pDstZ);
                    vec4 Mask   =   Or(Or(P0x, P0y), Or(P3x, P3y));
                    vec4 rZ =   *pDstZ++;
                    Mask        =   Or(Mask, CmpLE(rZ, VMinZ));
                    x               += 4;
                    Visible =   And(Visible, Mask);
                    P0x         =   Add(P0x, Y104);
                    P0y         =   Sub(P0y, Y204);
                    P3x         =   Add(P3x, Y134);
                    P3y         =   Sub(P3y, Y234);
                } while (x < MaxX);
                if (SignMask(Visible) != (BitX | BitY | BitZ | BitW))
                {
                    return true;
                }
                y++;
                dy4 = Add(dy4, Vec4One());
            } while (y < MaxY);
            return false;
        }


        void                                    Show();
    public:

        CULLINLINE                      CCullRenderer()
        {
            m_ZBuffer = m_ZBufferMainMemory;
            m_DebugRender   =   0;
            m_nNumWorker = 0;
            m_ZBufferSwap = NULL;
        }

        ~CCullRenderer()
        {
            for (uint32 i = 0; i < m_nNumWorker; ++i)
            {
                CryModuleMemalignFree(m_ZBufferSwap[i]);
            }
            delete[] m_ZBufferSwap;
        }

        void Prepare()
        {
            if (m_nNumWorker)
            {
                return;
            }

            m_nNumWorker = AZ::JobContext::GetGlobalContext()->GetJobManager().GetNumWorkerThreads();
            m_ZBufferSwap = new tdZexel*[m_nNumWorker];
            for (uint32 i = 0; i < m_nNumWorker; ++i)
            {
                m_ZBufferSwap[i] = (tdZexel*)CryModuleMemalign(sizeof(tdZexel) * SIZEX * SIZEY, 128);
            }
        }

        CULLINLINE  void            Clear()
        {
            m_VMaxXY    =   NVMath::int32Tofloat(NVMath::Vec4(SIZEX, SIZEY, SIZEX, SIZEY));
            for (uint32 a = 0, S = SIZEX * SIZEY; a < S; a++)
            {
                m_ZBuffer[a] = 9999999999.f;
            }
            m_DrawCall = 0;
            m_PolyCount = 0;
        }

        bool DownLoadHWDepthBuffer(float nearPlane, float farPlane, float nearestMax, float Bias)
        {
            Matrix44A& Reproject = *reinterpret_cast<Matrix44A*>(&m_Reproject);

            m_VMaxXY    =   NVMath::int32Tofloat(NVMath::Vec4(SIZEX, SIZEY, SIZEX, SIZEY));

            if (!gEnv->pRenderer->GetOcclusionBuffer((uint16*)&m_ZBuffer[0], reinterpret_cast<Matrix44*>(&Reproject)))
            {
                return false;
            }

            for (uint32 i = 0; i < m_nNumWorker; ++i)
            {
                memset(m_ZBufferSwap[i], 0, SIZEX * SIZEY * sizeof(float));
            }
            memset(m_ZBufferSwapMerged, 0, SIZEX * SIZEY * sizeof(float));

            return true;
        }

        void ReprojectHWDepthBuffer(const Matrix44A& rCurrent, float nearPlane, float farPlane, float nearestMax, float Bias, int nStartLine, int nNumLines)
        {
            AZ_PROFILE_FUNCTION(AZ::Debug::ProfileCategory::Renderer);

            //#define USE_W_DEPTH
            //#define SCALE_DEPTH

            const uint32 workerThreadID = AZ::JobContext::GetGlobalContext()->GetJobManager().GetWorkerThreadId();
            CRY_ASSERT(workerThreadID != AZ::JobManager::InvalidWorkerThreadId);
            float* pZBufferSwap = m_ZBufferSwap[workerThreadID];

            int sizeX = SIZEX;
            int sizeY = SIZEY;

            float fWidth    = (float) sizeX;
            float fHeight = (float) sizeY;

            const float a = farPlane / (farPlane - nearPlane);
            const float b = farPlane * nearPlane / (nearPlane - farPlane);

            Matrix44A fromScreen;
            fromScreen.SetIdentity();
            fromScreen.SetTranslation(Vec3(-1.0f + 0.5f / fWidth, 1.0f - 0.5f / fHeight, 0.0f));
            fromScreen.m00 = 2.0f / fWidth;
            fromScreen.m11 = -2.0f / fHeight; // Y flipped
            fromScreen.Transpose();

            Matrix44A Reproject =   *reinterpret_cast<Matrix44A*>(&m_Reproject);
            Reproject.Invert();
            DEFINE_ALIGNED_DATA(Matrix44A, mToWorld, 16);
            mToWorld = fromScreen  * Reproject;

            {
                int x, y;
                float fY;
                using namespace NVMath;

#ifdef USE_W_DEPTH
                Matrix44A  mReproject = mToWorld * rCurrent;
                const vec4 MR0  =   reinterpret_cast<vec4*>(&mReproject)[0];
                const vec4 MR1  =   reinterpret_cast<vec4*>(&mReproject)[1];
                const vec4 MR2  =   reinterpret_cast<vec4*>(&mReproject)[2];
                const vec4 MR3  =   reinterpret_cast<vec4*>(&mReproject)[3];

                const vec4 vA = NVMath::Vec4(a);
                const vec4 vB = NVMath::Vec4(b);
#else
                const vec4 MW0  =   reinterpret_cast<vec4*>(&mToWorld)[0];
                const vec4 MW1  =   reinterpret_cast<vec4*>(&mToWorld)[1];
                const vec4 MW2  =   reinterpret_cast<vec4*>(&mToWorld)[2];
                const vec4 MW3  =   reinterpret_cast<vec4*>(&mToWorld)[3];

                const vec4 MS0  =   reinterpret_cast<const vec4*>(&rCurrent)[0];
                const vec4 MS1  =   reinterpret_cast<const vec4*>(&rCurrent)[1];
                const vec4 MS2  =   reinterpret_cast<const vec4*>(&rCurrent)[2];
                const vec4 MS3  =   reinterpret_cast<const vec4*>(&rCurrent)[3];
#endif

                const vec4 vXOffsets    = NVMath::Vec4(0.0f, 1.0f, 2.0f, 3.0f);
                const vec4 vXIncrement = NVMath:: Vec4(4.0f);

                const float nearestLinear = b / (nearestMax - a);
                const vec4 vfEpsilon = NVMath::Vec4Epsilon();
                const vec4 vfOne = NVMath::Vec4One();
                const vec4 vZero = NVMath::Vec4Zero();

                vec4* pSrcZ =   reinterpret_cast<vec4*>(&m_ZBuffer[nStartLine * sizeX]);

                for (y = nStartLine, fY = static_cast<float>(nStartLine); y < nStartLine + nNumLines; y++, fY += 1.0f)
                {
                    const vec4 vYYYY = NVMath::Vec4(fY);

                    vec4 vXCoords = vXOffsets;

                    for (x = 0; x < sizeX; x += 4)
                    {
                        const vec4 vNonLinearDepth = *pSrcZ;

                        vec4 vXXXX[4];
                        vXXXX[0] = Splat<0>(vXCoords);
                        vXXXX[1] = Splat<1>(vXCoords);
                        vXXXX[2] = Splat<2>(vXCoords);
                        vXXXX[3] = Splat<3>(vXCoords);

                        vec4 vZZZZ[4];
                        vZZZZ[0] = Splat<0>(vNonLinearDepth);
                        vZZZZ[1] = Splat<1>(vNonLinearDepth);
                        vZZZZ[2] = Splat<2>(vNonLinearDepth);
                        vZZZZ[3] = Splat<3>(vNonLinearDepth);

                        for (int i = 0; i < 4; i++)
                        {
#ifdef USE_W_DEPTH
                            vec4 vScreenPos     = Madd(MR0, vXXXX[i], Madd(MR1, vYYYY, Madd(MR2, vZZZZ[i], MR3)));

                            vec4 vScreenPosH    = Div(vScreenPos, Splat<3>(vScreenPos));

                            vec4 vNewDepth      = Div(vB, Sub(Splat<2>(vScreenPosH), vA));

                            float newDepth      = Vec4float<2>(vNewDepth);
#else
                            vec4 vWorldPos      = Madd(MW0, vXXXX[i], Madd(MW1, vYYYY, Madd(MW2, vZZZZ[i], MW3)));

                            vec4 vWorldPosH     = Div(vWorldPos, Max(Splat<3>(vWorldPos), vfEpsilon));

                            vec4 vScreenPos     = Madd(MS0, Splat<0>(vWorldPosH), Madd(MS1, Splat<1>(vWorldPosH), Madd(MS2, Splat<2>(vWorldPosH), MS3)));

                            vec4 vNewDepth      = Splat<2>(vScreenPos);

                            vec4 vScreenPosH    = Div(vScreenPos, Max(Splat<3>(vScreenPos), vfEpsilon));

                            float newDepth      = Vec4float<2>(vNewDepth);
#endif
                            // It is faster to use simple non-vectorized code to write the depth in the buffer

                            if (newDepth > 0.f)
                            {
                                int X;
                                int Y;
                                if (Vec4float<0>(vZZZZ[i]) < nearestMax)
                                {
                                    X = x + i;
                                    Y = y;
                                    newDepth = nearestLinear;
                                }
                                else
                                {
                                    vec4 vFinalScreenPosU = floatToint32(vScreenPosH);

                                    X = Vec4int32<0>(vFinalScreenPosU);
                                    Y = Vec4int32<1>(vFinalScreenPosU);
                                }

                                if (X >= 0 && Y >= 0 && X < sizeX && Y < sizeY)
                                {
                                    float* pDstZ = &pZBufferSwap[X + (Y * sizeX)];
                                    float depth = *pDstZ;

                                    depth = depth <= 0.f ? farPlane : depth;
                                    *pDstZ = min(depth, newDepth);
                                }
                            }
                        }
                        vXCoords = Add(vXIncrement, vXCoords);

                        pSrcZ++;
                    }
                }
            }
        }

        void MergeReprojectHWDepthBuffer(int nStartLine, int nNumLines)
        {
            AZ_PROFILE_FUNCTION(AZ::Debug::ProfileCategory::Renderer);

            const int sizeX = SIZEX;
            using namespace NVMath;

            const vec4 zero = Vec4Zero();

            for (uint32 i = 0; i < m_nNumWorker; ++i)
            {
                for (int y = nStartLine; y < nStartLine + nNumLines; y++)
                {
                    for (int x = 0; x < sizeX; x += 4)
                    {
                        vec4* pDstZ = reinterpret_cast<vec4*>(&m_ZBufferSwapMerged[x + (y * sizeX)]);
                        vec4 vDstZ = *pDstZ;

                        vec4* pSrcZ = reinterpret_cast<vec4*>(&m_ZBufferSwap[i][x + (y * sizeX)]);
                        vec4 vSrcZ = *pSrcZ;

                        // remove zeros so Min doesn't select them
                        vDstZ = Select(vDstZ, vSrcZ, CmpLE(vDstZ, zero));
                        vSrcZ = Select(vSrcZ, vDstZ, CmpLE(vSrcZ, zero));

                        const vec4 vNewDepth = Min(vSrcZ, vDstZ);

                        *pDstZ = vNewDepth;
                    }
                }
            }
        }

        void ReprojectHWDepthBufferAfterMerge(const Matrix44A& rCurrent, float nearPlane, float farPlane, float nearestMax, float Bias, int nStartLine, int nNumLines)
        {
            AZ_PROFILE_FUNCTION(AZ::Debug::ProfileCategory::Renderer);

            using namespace NVMath;
            int sizeX = SIZEX;
            int sizeY = SIZEY;
            const vec4 vFarPlane    = NVMath::Vec4(farPlane);

            float* pZBufferSwap =   m_ZBufferSwapMerged;
            vec4*   pSwap   =   reinterpret_cast<vec4*>(&pZBufferSwap[0]);
            vec4*   pDst    =   reinterpret_cast<vec4*>(&m_ZBuffer[nStartLine * sizeX]);

            const vec4  vBiasAdd    =   NVMath::Vec4(Bias < 0.f ? -Bias : 0.f);
            const vec4  vBiasMul    =   NVMath::Vec4(Bias > 0.f ? Bias : 0.f);
            const int       pitchX      = SIZEX / 4;

            vec4 zero = Vec4Zero();

            for (int y = nStartLine; y < nStartLine + nNumLines; y++)
            {
                int minY = max((int)0, (int)y - 1);
                int maxY = min((int)sizeY - 1, (int)y + 1);

                int maxX = min(pitchX - 1, 0 + 1);

                vec4 src[3];
                vec4 srcMax[3];
                vec4 srcCenter;

                // left, no data available yet
                srcMax[0] = zero;

                // center
                src[0]  = pSwap[0 + minY    * pitchX];
                src[1]  = pSwap[0 + y           * pitchX];
                src[2]  = pSwap[0 + maxY    * pitchX];
                srcMax[1] = Max(Max(src[0], src[1]), src[2]);
                srcCenter = src[1];

                // right
                src[0]  = pSwap[maxX + minY * pitchX];
                src[1]  = pSwap[maxX + y        * pitchX];
                src[2]  = pSwap[maxX + maxY * pitchX];
                srcMax[2] = Max(Max(src[0], src[1]), src[2]);

                int vecX = 0;
                for (int x = 0; x < sizeX; x += 4)  //todo, fix edge cases
                {
                    vec4 vDst;

                    vec4 vSrcIsZero = CmpLE(srcCenter, zero);

                    // 0
                    {
                        vec4 vLeft, vCenter;
                        vLeft = SelectStatic<0x8>(zero, srcMax[0]);
                        vCenter = SelectStatic<0x3>(zero, srcMax[1]);

                        vec4 _vMax;
                        _vMax = Max(vLeft, vCenter);
                        _vMax = Max(_vMax, Swizzle<zwxy>(_vMax));
                        _vMax = Max(_vMax, Swizzle<wzyx>(_vMax));

                        vDst = _vMax;
                    }

                    // 1
                    {
                        vec4 vCenter;

                        vCenter = SelectStatic<0x7>(zero, srcMax[1]);

                        vec4 _vMax;
                        _vMax = Max(vCenter, Swizzle<zwxy>(vCenter));
                        _vMax = Max(_vMax, Swizzle<wzyx>(_vMax));

                        vDst = SelectStatic<0x2>(vDst, _vMax);
                    }

                    // 2
                    {
                        vec4 vCenter;

                        vCenter = SelectStatic<0xE>(zero, srcMax[1]);

                        vec4 _vMax;
                        _vMax = Max(vCenter, Swizzle<zwxy>(vCenter));
                        _vMax = Max(_vMax, Swizzle<wzyx>(_vMax));

                        vDst = SelectStatic<0x4>(vDst, _vMax);
                    }

                    // 3
                    {
                        vec4 vRight, vCenter;

                        vRight = SelectStatic<0x1>(zero, srcMax[2]);
                        vCenter = SelectStatic<0xC>(zero, srcMax[1]);

                        vec4 _vMax;
                        _vMax = Max(vRight, vCenter);

                        _vMax = Max(_vMax, Swizzle<zwxy>(_vMax));
                        _vMax = Max(_vMax, Swizzle<wzyx>(_vMax));

                        vDst = SelectStatic<0x8>(vDst, _vMax);
                    }

                    vec4 vDstIsZero = CmpLE(vDst, zero);
                    vDst = Select(vDst, vFarPlane, vDstIsZero);

                    vDst = Select(srcCenter, vDst, vSrcIsZero);

                    vDst    =   Add(vDst, vBiasAdd);//linear bias
                    vDst    =   Add(vDst, Madd(vBiasMul, vDst, vBiasMul));// none-linear bias
#ifdef SCALE_DEPTH
                    //*pDst = Mul(vDst, NVMath::Vec4(1.2f));
                    *pDst = Add(vDst, NVMath::Vec4(0.5f));
#else
                    *pDst = vDst;
#endif

                    //next loop
                    ++pDst;
                    ++vecX;

                    // shift to the left
                    srcMax[0] = srcMax[1];
                    srcMax[1] = srcMax[2];
                    srcCenter   = src[1];

                    // load right data
                    maxX = min(pitchX - 1, vecX + 1);
                    src[0]  = pSwap[maxX    + minY  * pitchX];
                    src[1]  = pSwap[maxX    + y         * pitchX];
                    src[2]  = pSwap[maxX    + maxY  * pitchX];
                    srcMax[2] = Max(Max(src[0], src[1]), src[2]);
                }
            }

            //for(int a=0;a<128;a+=16)
            //  printf("%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f\n",
            //  m_ZBuffer[a+0],m_ZBuffer[a+1],m_ZBuffer[a+2],m_ZBuffer[a+3],
            //  m_ZBuffer[a+4],m_ZBuffer[a+5],m_ZBuffer[a+6],m_ZBuffer[a+7],
            //  m_ZBuffer[a+8],m_ZBuffer[a+9],m_ZBuffer[a+10],m_ZBuffer[a+11],
            //  m_ZBuffer[a+12],m_ZBuffer[a+13],m_ZBuffer[a+14],m_ZBuffer[a+15]);

#ifdef CULL_RENDERER_REPROJ_DEBUG
            memcpy(&pZBufferSwap[nStartLine * sizeX], &m_ZBuffer[nStartLine * sizeX], sizeX * nNumLines * sizeof(float));
#endif


#ifdef SCALE_DEPTH
#undef SCALE_DEPTH
#endif

#ifdef USE_W_DEPTH
#undef USE_W_DEPTH
#endif
        }

        CULLNOINLINE    int         AABBInFrustum(const NVMath::vec4* pViewProj, Vec3 Min, Vec3 Max, Vec3 ViewPos)
        {
            using namespace NVMath;
            const NVMath::vec4 M0   =   pViewProj[0];
            const NVMath::vec4 M1   =   pViewProj[1];
            const NVMath::vec4 M2   =   pViewProj[2];
            const NVMath::vec4 M3   =   pViewProj[3];
            const NVMath::vec4 MinX =   NVMath::Vec4(Min.x);
            const NVMath::vec4 MinY =   NVMath::Vec4(Min.y);
            const NVMath::vec4 MinZ =   NVMath::Vec4(Min.z);
            const NVMath::vec4 MaxX =   NVMath::Vec4(Max.x);
            const NVMath::vec4 MaxY =   NVMath::Vec4(Max.y);
            const NVMath::vec4 MaxZ =   NVMath::Vec4(Max.z);

            vec4    VB0     =   Madd(MinX, M0, Madd(MinY, M1, Madd(MinZ, M2, M3)));
            vec4    VB1     =   Madd(MinX, M0, Madd(MaxY, M1, Madd(MinZ, M2, M3)));
            vec4    VB2     =   Madd(MaxX, M0, Madd(MinY, M1, Madd(MinZ, M2, M3)));
            vec4    VB3     =   Madd(MaxX, M0, Madd(MaxY, M1, Madd(MinZ, M2, M3)));
            vec4    VB4     =   Madd(MinX, M0, Madd(MinY, M1, Madd(MaxZ, M2, M3)));
            vec4    VB5     =   Madd(MinX, M0, Madd(MaxY, M1, Madd(MaxZ, M2, M3)));
            vec4    VB6     =   Madd(MaxX, M0, Madd(MinY, M1, Madd(MaxZ, M2, M3)));
            vec4    VB7     =   Madd(MaxX, M0, Madd(MaxY, M1, Madd(MaxZ, M2, M3)));
            vec4 SMask  =   And(And(And(VB0, VB1), And(VB2, VB3)), And(Or(VB4, VB5), And(VB6, VB7)));
            if (SignMask(SMask) & BitZ)
            {
                return 0;
            }

            int Visible = 3;

            SMask   =   Or(Or(Or(VB0, VB1), Or(VB2, VB3)), Or(Or(VB4, VB5), Or(VB6, VB7)));
            if ((SignMask(SMask) & BitZ) == 0)
            {
                VB0 =   Div(VB0, Splat<3>(VB0));
                VB1 =   Div(VB1, Splat<3>(VB1));
                VB2 =   Div(VB2, Splat<3>(VB2));
                VB3 =   Div(VB3, Splat<3>(VB3));
                VB4 =   Div(VB4, Splat<3>(VB4));
                VB5 =   Div(VB5, Splat<3>(VB5));
                VB6 =   Div(VB6, Splat<3>(VB6));
                VB7 =   Div(VB7, Splat<3>(VB7));
                const vec4 VC0  =   Madd(VB0, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC1  =   Madd(VB1, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC2  =   Madd(VB2, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC3  =   Madd(VB3, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC4  =   Madd(VB4, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC5  =   Madd(VB5, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC6  =   Madd(VB6, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 VC7  =   Madd(VB7, NVMath::Vec4(-1.f), m_VMaxXY);
                const vec4 SMaskB   =   And(And(And(VB0, VB1), And(VB2, VB3)), And(And(VB4, VB5), And(VB6, VB7)));
                const vec4 SMaskC   =   And(And(And(VC0, VC1), And(VC2, VC3)), And(And(VC4, VC5), And(VC6, VC7)));
                if ((SignMask(SMaskB) & (BitX | BitY)) || (SignMask(SMaskC) & (BitX | BitY)))
                {
                    return 0;
                }
                Visible = 1;
            }
            //return true;
            if (Max.x < ViewPos.x)
            {
                if (Triangle<false, true, true>(VB3, VB2, VB7))
                {
                    return Visible;                                                                                 //MaxX
                }
                if (Triangle<false, true, true>(VB7, VB2, VB6))
                {
                    return Visible;
                }
                Visible &= ~1;
            }
            else
            if (Min.x > ViewPos.x)
            {
                if (Triangle<false, true, true>(VB0, VB1, VB4))
                {
                    return Visible;                                                                                 //MinX
                }
                if (Triangle<false, true, true>(VB4, VB1, VB5))
                {
                    return Visible;
                }
                Visible &= ~1;
            }
            if (Max.y < ViewPos.y)
            {
                if (Triangle<false, true, true>(VB1, VB3, VB5))
                {
                    return Visible | 1;                                                                             //MaxY
                }
                if (Triangle<false, true, true>(VB5, VB3, VB7))
                {
                    return Visible | 1;
                }
                Visible &= ~1;
            }
            else
            if (Min.y > ViewPos.y)
            {
                if (Triangle<false, true, true>(VB2, VB0, VB6))
                {
                    return Visible | 1;                                                                             //MinY
                }
                if (Triangle<false, true, true>(VB6, VB0, VB4))
                {
                    return Visible | 1;
                }
                Visible &= ~1;
            }
            if (Max.z < ViewPos.z)
            {
                if (Triangle<false, true, true>(VB4, VB5, VB6))
                {
                    return Visible | 1;                                                                             //MaxZ
                }
                if (Triangle<false, true, true>(VB6, VB5, VB7))
                {
                    return Visible | 1;
                }
                Visible = 0;
            }
            else
            if (Min.z > ViewPos.z)
            {
                if (Triangle<false, true, true>(VB1, VB0, VB3))
                {
                    return Visible | 1;                                                                             //MinZ
                }
                if (Triangle<false, true, true>(VB3, VB0, VB2))
                {
                    return Visible | 1;
                }
                Visible = 0;
            }
            return Visible & (Visible << 1);
        }


        CULLINLINE bool             TestQuad(const NVMath::vec4* pViewProj, const Vec3& vCenter, const Vec3& vAxisX, const Vec3& vAxisY)
        {
            const NVMath::vec4 M0   =   pViewProj[0];
            const NVMath::vec4 M1   =   pViewProj[1];
            const NVMath::vec4 M2   =   pViewProj[2];
            const NVMath::vec4 M3   =   pViewProj[3];

            const Vec3 v0 = vCenter - vAxisX - vAxisY;
            const Vec3 v1 = vCenter - vAxisX + vAxisY;
            const Vec3 v2 = vCenter + vAxisX + vAxisY;
            const Vec3 v3 = vCenter + vAxisX - vAxisY;

            const NVMath::vec4  VB0     =   NVMath::Madd(NVMath::Vec4(v0.x), M0, NVMath::Madd(NVMath::Vec4(v0.y), M1, NVMath::Madd(NVMath::Vec4(v0.z), M2, M3)));
            const NVMath::vec4  VB1     =   NVMath::Madd(NVMath::Vec4(v1.x), M0, NVMath::Madd(NVMath::Vec4(v1.y), M1, NVMath::Madd(NVMath::Vec4(v1.z), M2, M3)));
            const NVMath::vec4  VB2     =   NVMath::Madd(NVMath::Vec4(v2.x), M0, NVMath::Madd(NVMath::Vec4(v2.y), M1, NVMath::Madd(NVMath::Vec4(v2.z), M2, M3)));
            const NVMath::vec4  VB3     =   NVMath::Madd(NVMath::Vec4(v3.x), M0, NVMath::Madd(NVMath::Vec4(v3.y), M1, NVMath::Madd(NVMath::Vec4(v3.z), M2, M3)));

            // Note: Explicitly disabling backface culling here
            if (Triangle<false, true, false>(VB2, VB0, VB3))
            {
                return true;
            }
            if (Triangle<false, true, false>(VB1, VB0, VB2))
            {
                return true;
            }

            return false;
        }

        CULLNOINLINE    bool        TestAABB(const NVMath::vec4* pViewProj, Vec3 Min, Vec3 Max, Vec3 ViewPos)
        {
            using namespace NVMath;
            const NVMath::vec4 M0   =   pViewProj[0];
            const NVMath::vec4 M1   =   pViewProj[1];
            const NVMath::vec4 M2   =   pViewProj[2];
            const NVMath::vec4 M3   =   pViewProj[3];
            const NVMath::vec4 MinX =   NVMath::Vec4(Min.x);
            const NVMath::vec4 MinY =   NVMath::Vec4(Min.y);
            const NVMath::vec4 MinZ =   NVMath::Vec4(Min.z);
            const NVMath::vec4 MaxX =   NVMath::Vec4(Max.x);
            const NVMath::vec4 MaxY =   NVMath::Vec4(Max.y);
            const NVMath::vec4 MaxZ =   NVMath::Vec4(Max.z);

            const vec4  VB0     =   Madd(MinX, M0, Madd(MinY, M1, Madd(MinZ, M2, M3)));
            const vec4  VB1     =   Madd(MinX, M0, Madd(MaxY, M1, Madd(MinZ, M2, M3)));
            const vec4  VB2     =   Madd(MaxX, M0, Madd(MinY, M1, Madd(MinZ, M2, M3)));
            const vec4  VB3     =   Madd(MaxX, M0, Madd(MaxY, M1, Madd(MinZ, M2, M3)));
            const vec4  VB4     =   Madd(MinX, M0, Madd(MinY, M1, Madd(MaxZ, M2, M3)));
            const vec4  VB5     =   Madd(MinX, M0, Madd(MaxY, M1, Madd(MaxZ, M2, M3)));
            const vec4  VB6     =   Madd(MaxX, M0, Madd(MinY, M1, Madd(MaxZ, M2, M3)));
            const vec4  VB7     =   Madd(MaxX, M0, Madd(MaxY, M1, Madd(MaxZ, M2, M3)));
            vec4 SMask  =   Or(Or(Or(VB0, VB1), Or(VB2, VB3)), Or(Or(VB4, VB5), Or(VB6, VB7)));

            if (SignMask(SMask) & BitZ)
            {
                if (Max.x < ViewPos.x)
                {
                    if (Triangle<false, true, true>(VB3, VB2, VB7))
                    {
                        return true;                                                                                //MaxX
                    }
                    if (Triangle<false, true, true>(VB7, VB2, VB6))
                    {
                        return true;
                    }
                }
                if (Min.x > ViewPos.x)
                {
                    if (Triangle<false, true, true>(VB0, VB1, VB4))
                    {
                        return true;                                                                                //MinX
                    }
                    if (Triangle<false, true, true>(VB4, VB1, VB5))
                    {
                        return true;
                    }
                }
                if (Max.y < ViewPos.y)
                {
                    if (Triangle<false, true, true>(VB1, VB3, VB5))
                    {
                        return true;                                                                                //MaxY
                    }
                    if (Triangle<false, true, true>(VB5, VB3, VB7))
                    {
                        return true;
                    }
                }
                if (Min.y > ViewPos.y)
                {
                    if (Triangle<false, true, true>(VB2, VB0, VB6))
                    {
                        return true;                                                                                //MinY
                    }
                    if (Triangle<false, true, true>(VB6, VB0, VB4))
                    {
                        return true;
                    }
                }
                if (Max.z < ViewPos.z)
                {
                    if (Triangle<false, true, true>(VB4, VB5, VB6))
                    {
                        return true;                                                                                //MaxZ
                    }
                    if (Triangle<false, true, true>(VB6, VB5, VB7))
                    {
                        return true;
                    }
                }
                if (Min.z > ViewPos.z)
                {
                    if (Triangle<false, true, true>(VB1, VB0, VB3))
                    {
                        return true;                                                                                //MinZ
                    }
                    if (Triangle<false, true, true>(VB3, VB0, VB2))
                    {
                        return true;
                    }
                }
            }
            else
            {
                if (Max.x < ViewPos.x)
                {
                    //if(Quad2D(VB3,VB2,VB6,VB7))return true;
                    if (Triangle2D<false, true, true, true>(VB3, VB2, VB7))
                    {
                        return true;
                    }
                    if (Triangle2D<false, true, true, true>(VB7, VB2, VB6))
                    {
                        return true;
                    }
                }
                if (Min.x > ViewPos.x)
                {
                    //if(Quad2D(VB0,VB1,VB5,VB4))return true;
                    if (Triangle2D<false, true, true, true>(VB0, VB1, VB4))
                    {
                        return true;
                    }
                    if (Triangle2D<false, true, true, true>(VB4, VB1, VB5))
                    {
                        return true;
                    }
                }
                if (Max.y < ViewPos.y)
                {
                    //if(Quad2D(VB1,VB3,VB7,VB5))return true;
                    if (Triangle2D<false, true, true, true>(VB1, VB3, VB5))
                    {
                        return true;
                    }
                    if (Triangle2D<false, true, true, true>(VB5, VB3, VB7))
                    {
                        return true;
                    }
                }
                if (Min.y > ViewPos.y)
                {
                    //if(Quad2D(VB2,VB0,VB4,VB6))return true;
                    if (Triangle2D<false, true, true, true>(VB2, VB0, VB6))
                    {
                        return true;
                    }
                    if (Triangle2D<false, true, true, true>(VB6, VB0, VB4))
                    {
                        return true;
                    }
                }
                if (Max.z < ViewPos.z)
                {
                    //if(Quad2D(VB4,VB5,VB7,VB6))return true;
                    if (Triangle2D<false, true, true, true>(VB4, VB5, VB6))
                    {
                        return true;
                    }
                    if (Triangle2D<false, true, true, true>(VB6, VB5, VB7))
                    {
                        return true;
                    }
                }
                if (Min.z > ViewPos.z)
                {
                    //if(Quad2D(VB1,VB0,VB2,VB3))return true;
                    if (Triangle2D<false, true, true, true>(VB1, VB0, VB3))
                    {
                        return true;
                    }
                    if (Triangle2D<false, true, true, true>(VB3, VB0, VB2))
                    {
                        return true;
                    }
                }
            }
            return false;
        }

        template<bool NEEDCLIPPING>
        CULLNOINLINE    void        Rasterize(const NVMath::vec4* pViewProj, const NVMath::vec4*   __restrict  pTriangles, size_t   TriCount)
        {
            using namespace NVMath;
            Prefetch<ECL_LVL1>(pTriangles);
            m_DrawCall++;
            m_PolyCount += TriCount;

            const vec4 M0   =   pViewProj[0];
            const vec4 M1   =   pViewProj[1];
            const vec4 M2   =   pViewProj[2];
            const vec4 M3   =   pViewProj[3];
            const size_t VCacheCount    =   48;                                        //16x3 vertices
            vec4 VTmp[VCacheCount];
            vec4 DetTmp[VCacheCount * 2 / 3];



            if (TriCount > 65535)
            {
                TriCount = 65535;
            }
            for (size_t a = 0, S = TriCount; a < S; a += VCacheCount)
            {
                vec4 ZMask  =   Vec4Zero();
                const size_t VTmpCount  =   VCacheCount + a > TriCount ? TriCount - a : VCacheCount;
                vec4* pVTmp =   VTmp;
                for (size_t b = 0; b < VTmpCount; b += 3, pVTmp += 3, pTriangles += 3)
                {
                    Prefetch<ECL_LVL1>(pTriangles + 48);
                    const vec4 VA   =   reinterpret_cast<const vec4*>(pTriangles)[0];
                    const vec4 VB   =   reinterpret_cast<const vec4*>(pTriangles)[1];
                    const vec4 VC   =   reinterpret_cast<const vec4*>(pTriangles)[2];

                    const vec4 V0 =  Madd(Splat<0>(VA), M0, Madd(Splat<1>(VA), M1, Madd(Splat<2>(VA), M2, M3)));
                    const vec4 V1 =  Madd(Splat<0>(VB), M0, Madd(Splat<1>(VB), M1, Madd(Splat<2>(VB), M2, M3)));
                    const vec4 V2 =  Madd(Splat<0>(VC), M0, Madd(Splat<1>(VC), M1, Madd(Splat<2>(VC), M2, M3)));
                    if (NEEDCLIPPING)
                    {
                        ZMask =  Or(Or(ZMask, V0), Or(V1, V2));
                    }
                    pVTmp[0]    =   V0;
                    pVTmp[1]    =   V1;
                    pVTmp[2]    =   V2;
                }

                const   uint32  Idx =   SignMask(ZMask) & BitZ;
                if (NEEDCLIPPING && Idx == BitZ)
                {
                    for (size_t b = 0; b < VTmpCount; b += 3)
                    {
                        Triangle<true, false, true>(VTmp[b], VTmp[b + 2], VTmp[b + 1]);
                    }
                }
                else
                {
                    pVTmp   =   VTmp;
                    const vec4 M    =   NVMath::Vec4(~0u, ~0u, 0u, ~0u);
                    pVTmp   =   VTmp;
                    vec4* pDetTmp   =   DetTmp;
                    for (size_t b = 0; b < VTmpCount; b += 12, pVTmp += 12, pDetTmp += 8)
                    {
                        vec4 V0 =   pVTmp[0];
                        vec4 V1 =   pVTmp[1];
                        vec4 V2 =   pVTmp[2];
                        vec4 V3 =   pVTmp[3];
                        vec4 V4 =   pVTmp[4];
                        vec4 V5 =   pVTmp[5];
                        vec4 V6 =   pVTmp[6];
                        vec4 V7 =   pVTmp[7];
                        vec4 V8 =   pVTmp[8];
                        vec4 V9 =   pVTmp[9];
                        vec4 VA =   pVTmp[10];
                        vec4 VB =   pVTmp[11];
                        const   vec4    W0123   =   Shuffle<xzxz>(Shuffle<wwww>(V0, V1), Shuffle<wwww>(V2, V3));
                        const   vec4    W4567   =   Shuffle<xzxz>(Shuffle<wwww>(V4, V5), Shuffle<wwww>(V6, V7));
                        const   vec4    W89AB   =   Shuffle<xzxz>(Shuffle<wwww>(V8, V9), Shuffle<wwww>(VA, VB));
                        const   vec4    iW0123  =   Rcp(W0123);
                        const   vec4    iW4567  =   Rcp(W4567);
                        const   vec4    iW89AB  =   Rcp(W89AB);
                        const vec4 V0T  =   Mul(V0, Splat<0>(iW0123));
                        const vec4 V1T  =   Mul(V1, Splat<1>(iW0123));
                        const vec4 V2T  =   Mul(V2, Splat<2>(iW0123));
                        const vec4 V3T  =   Mul(V3, Splat<3>(iW0123));
                        const vec4 V4T  =   Mul(V4, Splat<0>(iW4567));
                        const vec4 V5T  =   Mul(V5, Splat<1>(iW4567));
                        const vec4 V6T  =   Mul(V6, Splat<2>(iW4567));
                        const vec4 V7T  =   Mul(V7, Splat<3>(iW4567));
                        const vec4 V8T  =   Mul(V8, Splat<0>(iW89AB));
                        const vec4 V9T  =   Mul(V9, Splat<1>(iW89AB));
                        const vec4 VAT  =   Mul(VA, Splat<2>(iW89AB));
                        const vec4 VBT  =   Mul(VB, Splat<3>(iW89AB));
                        V0  =   SelectBits(V0, V0T, M);
                        V1  =   SelectBits(V1, V1T, M);
                        V2  =   SelectBits(V2, V2T, M);
                        V3  =   SelectBits(V3, V3T, M);
                        V4  =   SelectBits(V4, V4T, M);
                        V5  =   SelectBits(V5, V5T, M);
                        V6  =   SelectBits(V6, V6T, M);
                        V7  =   SelectBits(V7, V7T, M);
                        V8  =   SelectBits(V8, V8T, M);
                        V9  =   SelectBits(V9, V9T, M);
                        VA  =   SelectBits(VA, VAT, M);
                        VB  =   SelectBits(VB, VBT, M);
                        vec4    V012    =   Sub(Shuffle<xyxy>(V2T, V1T), Swizzle<xyxy>(V0T));
                        vec4    V345    =   Sub(Shuffle<xyxy>(V5T, V4T), Swizzle<xyxy>(V3T));
                        vec4    V678    =   Sub(Shuffle<xyxy>(V8T, V7T), Swizzle<xyxy>(V6T));
                        vec4    V9AB    =   Sub(Shuffle<xyxy>(VBT, VAT), Swizzle<xyxy>(V9T));
                        vec4    Det012 = Mul(V012, Swizzle<wzwz>(V012));
                        vec4    Det345 = Mul(V345, Swizzle<wzwz>(V345));
                        vec4    Det678 = Mul(V678, Swizzle<wzwz>(V678));
                        vec4    Det9AB = Mul(V9AB, Swizzle<wzwz>(V9AB));
                        Det012          =   Sub(Det012, Splat<1>(Det012));
                        Det345          =   Sub(Det345, Splat<1>(Det345));
                        Det678          =   Sub(Det678, Splat<1>(Det678));
                        Det9AB          =   Sub(Det9AB, Splat<1>(Det9AB));
                        vec4    Det     =   Shuffle<xzxz>(Shuffle<xxxx>(Det012, Det345), Shuffle<xxxx>(Det678, Det9AB));
#if !defined(LINUX) && !defined(APPLE)  //to avoid DivBy0 exception on PC
                        Det     =   Select(Det, NVMath::Vec4(-FLT_EPSILON), CmpEq(Det, Vec4Zero()));
#endif
                        Det                 =   Rcp(Det);
                        Det012          =   Splat<0>(Det);
                        Det345          =   Splat<1>(Det);
                        Det678          =   Splat<2>(Det);
                        Det9AB          =   Splat<3>(Det);

                        vec4 VMax012    =   Max(Max(V0T, V1T), V2T);
                        vec4 VMax345    =   Max(Max(V3T, V4T), V5T);
                        vec4 VMax678    =   Max(Max(V6T, V7T), V8T);
                        vec4 VMax9AB    =   Max(Max(V9T, VAT), VBT);
                        vec4 VMin012    =   Min(Min(V0T, V1T), V2T);
                        vec4 VMin345    =   Min(Min(V3T, V4T), V5T);
                        vec4 VMin678    =   Min(Min(V6T, V7T), V8T);
                        vec4 VMin9AB    =   Min(Min(V9T, VAT), VBT);
                        VMax012     =   Add(VMax012, Vec4One());
                        VMax345     =   Add(VMax345, Vec4One());
                        VMax678     =   Add(VMax678, Vec4One());
                        VMax9AB     =   Add(VMax9AB, Vec4One());
                        vec4 VMinMax012 =   Shuffle<xyxy>(VMin012, VMax012);
                        vec4 VMinMax345 =   Shuffle<xyxy>(VMin345, VMax345);
                        vec4 VMinMax678 =   Shuffle<xyxy>(VMin678, VMax678);
                        vec4 VMinMax9AB =   Shuffle<xyxy>(VMin9AB, VMax9AB);
                        VMinMax012  =   Max(VMinMax012, Vec4Zero());
                        VMinMax345  =   Max(VMinMax345, Vec4Zero());
                        VMinMax678  =   Max(VMinMax678, Vec4Zero());
                        VMinMax9AB  =   Max(VMinMax9AB, Vec4Zero());
                        VMinMax012  =   Min(VMinMax012, m_VMaxXY);
                        VMinMax345  =   Min(VMinMax345, m_VMaxXY);
                        VMinMax678  =   Min(VMinMax678, m_VMaxXY);
                        VMinMax9AB  =   Min(VMinMax9AB, m_VMaxXY);
                        VMinMax012  =   floatToint32(VMinMax012);
                        VMinMax345  =   floatToint32(VMinMax345);
                        VMinMax678  =   floatToint32(VMinMax678);
                        VMinMax9AB  =   floatToint32(VMinMax9AB);
                        VMinMax012  =   Or(VMinMax012, CmpLE(Det012, Vec4Zero()));                                      //backface cull
                        VMinMax345  =   Or(VMinMax345, CmpLE(Det345, Vec4Zero()));
                        VMinMax678  =   Or(VMinMax678, CmpLE(Det678, Vec4Zero()));
                        VMinMax9AB  =   Or(VMinMax9AB, CmpLE(Det9AB, Vec4Zero()));

                        pVTmp[0]    =   V0;
                        pVTmp[1]    =   V1;
                        pVTmp[2]    =   V2;
                        pVTmp[3]    =   V3;
                        pVTmp[4]    =   V4;
                        pVTmp[5]    =   V5;
                        pVTmp[6]    =   V6;
                        pVTmp[7]    =   V7;
                        pVTmp[8]    =   V8;
                        pVTmp[9]    =   V9;
                        pVTmp[10]   =   VA;
                        pVTmp[11]   =   VB;
                        pDetTmp[0]  =   VMinMax012;
                        pDetTmp[1]  =   Mul(V012, Det012);
                        pDetTmp[2]  =   VMinMax345;
                        pDetTmp[3]  =   Mul(V345, Det345);
                        pDetTmp[4]  =   VMinMax678;
                        pDetTmp[5]  =   Mul(V678, Det678);
                        pDetTmp[6]  =   VMinMax9AB;
                        pDetTmp[7]  =   Mul(V9AB, Det9AB);
                    }

                    pDetTmp =   DetTmp;
                    for (size_t b = 0; b < VTmpCount; b += 3, pDetTmp += 2)
                    {
                        const uint32* pMM       =   reinterpret_cast<uint32*>(pDetTmp);
                        const   uint16  MinX    =   pMM[0];
                        const uint16    MinY    =   pMM[1];
                        const uint16    MaxX    =   pMM[2];
                        const uint16    MaxY    =   pMM[3];
                        if (MinX < MaxX && MinY < MaxY)
                        {
                            Triangle2D<true, false, false, true>(VTmp[b], VTmp[b + 2], VTmp[b + 1], MinX, MinY, MaxX, MaxY, pDetTmp[0], pDetTmp[1]);
                        }
                    }
                }
            }
        }
        template<bool WRITE>
        CULLNOINLINE    bool        Rasterize(const NVMath::vec4* pViewProj, tdVertexCacheArg vertexCache,
            const tdIndex*  __restrict pIndices, const uint32 ICount,
            const uint8*        __restrict pVertices, const uint32 VertexSize, const uint32 VCount)
        {
            using namespace NVMath;
            if (!VCount || !ICount)
            {
                return false;
            }

            m_DrawCall++;
            m_PolyCount += VCount / 3;

            const vec4 M0   =   pViewProj[0];
            const vec4 M1   =   pViewProj[1];
            const vec4 M2   =   pViewProj[2];
            const vec4 M3   =   pViewProj[3];

            if (VCount + 1 > vertexCache.size())
            {
                vertexCache.resize(VCount + 1);
            }
            vec4* pVCache   =   &vertexCache[0];
            pVCache =   reinterpret_cast<vec4*>(((reinterpret_cast<size_t>(pVCache) + 15) & ~15));

            vec4 SMask  =   Vec4Zero();
            for (uint32 a = 0, S = VCount & ~3; a < S; a += 4)
            {
                const float* pV0    =   reinterpret_cast<const float*>(pVertices + a * VertexSize);
                const float* pV1    =   reinterpret_cast<const float*>(pVertices + (a + 1) * VertexSize);
                const float* pV2    =   reinterpret_cast<const float*>(pVertices + (a + 2) * VertexSize);
                const float* pV3    =   reinterpret_cast<const float*>(pVertices + (a + 3) * VertexSize);
                const vec4 V0   =   Madd(NVMath::Vec4(pV0[0]), M0, Madd(NVMath::Vec4(pV0[1]), M1, Madd(NVMath::Vec4(pV0[2]), M2, M3)));
                const vec4 V1   =   Madd(NVMath::Vec4(pV1[0]), M0, Madd(NVMath::Vec4(pV1[1]), M1, Madd(NVMath::Vec4(pV1[2]), M2, M3)));
                const vec4 V2   =   Madd(NVMath::Vec4(pV2[0]), M0, Madd(NVMath::Vec4(pV2[1]), M1, Madd(NVMath::Vec4(pV2[2]), M2, M3)));
                const vec4 V3   =   Madd(NVMath::Vec4(pV3[0]), M0, Madd(NVMath::Vec4(pV3[1]), M1, Madd(NVMath::Vec4(pV3[2]), M2, M3)));
                SMask   =   Or(SMask, V0);
                SMask   =   Or(SMask, V1);
                SMask   =   Or(SMask, V2);
                SMask   =   Or(SMask, V3);
                pVCache[a  ]    =   V0;
                pVCache[a + 1]    =   V1;
                pVCache[a + 2]    =   V2;
                pVCache[a + 3]    =   V3;
            }
            for (uint32 a = VCount & ~3, S = VCount; a < S; a++)
            {
                const float* pV =   reinterpret_cast<const float*>(pVertices + a * VertexSize);
                const vec4 V    =   Madd(NVMath::Vec4(pV[0]), M0, Madd(NVMath::Vec4(pV[1]), M1, Madd(NVMath::Vec4(pV[2]), M2, M3)));
                SMask   =   Or(SMask, V);
                pVCache[a]      =   V;
            }

            bool Visible = false;
            if (SignMask(SMask) & BitZ)
            {
                for (uint32 a = 0; a < ICount; a += 3)
                {
                    vec4 Pos0   =   pVCache[pIndices[a]];
                    vec4 Pos2   =   pVCache[pIndices[a + 1]];
                    vec4 Pos1   =   pVCache[pIndices[a + 2]];
                    Visible |= Triangle<WRITE, true>(Pos0, Pos1, Pos2);
                    if (!WRITE && Visible)
                    {
                        return true;
                    }
                }
            }
            else
            {
                for (uint32 a = 0; a < ICount; a += 3)
                {
                    vec4 Pos0   =   pVCache[pIndices[a]];
                    vec4 Pos2   =   pVCache[pIndices[a + 1]];
                    vec4 Pos1   =   pVCache[pIndices[a + 2]];
                    Visible |= Triangle2D<WRITE, true>(Pos0, Pos1, Pos2);
                    if (!WRITE && Visible)
                    {
                        return true;
                    }
                }
            }
            return Visible;
        }

        int m_DebugRender;

        void                                    DrawDebug(IRenderer* pRenderer, int32 nStep)
        {                                         // project buffer to the screen
#if defined(CULLING_ENABLE_DEBUG_OVERLAY)
            nStep   %=  32;
            if (!nStep)
            {
                return;
            }

            //if(!m_DebugRender)
            //  return;

            const float FarPlaneInv =   255.f / pRenderer->GetCamera().GetFarPlane();

            SAuxGeomRenderFlags oFlags(e_Def2DPublicRenderflags);
            oFlags.SetDepthTestFlag(e_DepthTestOff);
            oFlags.SetDepthWriteFlag(e_DepthWriteOff);
            oFlags.SetCullMode(e_CullModeNone);
            oFlags.SetAlphaBlendMode(e_AlphaNone);
            pRenderer->GetIRenderAuxGeom()->SetRenderFlags(oFlags);

            int nScreenHeight = gEnv->pRenderer->GetHeight();
            int nScreenWidth = gEnv->pRenderer->GetWidth();

            float fScreenHeight = (float)nScreenHeight;
            float fScreenWidth = (float)nScreenWidth;

            float fTopOffSet = 35.0f;
            float fSideOffSet = 35.0f;

            // draw z-buffer after reprojection (unknown parts are red)
            fTopOffSet += 200.0f;
            for (uint32 y = 0; y < SIZEY; y += 1)
            {
                const float* __restrict pVMemZ  =   alias_cast<float*>(&m_ZBuffer[y * SIZEX]);
                float fY = fTopOffSet + (y * 3);
                for (uint32 x = 0; x < SIZEX; x += 4)
                {
                    float fX0 = fSideOffSet + ((x + 0) * 3);
                    float fX1 = fSideOffSet + ((x + 1) * 3);
                    float fX2 = fSideOffSet + ((x + 2) * 3);
                    float fX3 = fSideOffSet + ((x + 3) * 3);

                    //ColorB ValueColor0  = ((ColorB*)pVMemZ)[x+0];
                    //ColorB ValueColor1  = ((ColorB*)pVMemZ)[x+1];
                    //ColorB ValueColor2  = ((ColorB*)pVMemZ)[x+2];
                    //ColorB ValueColor3  = ((ColorB*)pVMemZ)[x+3];
                    ////ColorB color0=ColorB(ValueColor0,ValueColor0,ValueColor0,222);
                    ////ColorB color1=ColorB(ValueColor1,ValueColor1,ValueColor1,222);
                    ////ColorB color2=ColorB(ValueColor2,ValueColor2,ValueColor2,222);
                    ////ColorB color3=ColorB(ValueColor3,ValueColor3,ValueColor3,222);
                    //
                    //NAsyncCull::Debug::Draw2DBox(fX0,fY,3.0f,3.0f,ValueColor0, fScreenHeight,fScreenWidth,pRenderer->GetIRenderAuxGeom());
                    //NAsyncCull::Debug::Draw2DBox(fX1,fY,3.0f,3.0f,ValueColor1, fScreenHeight,fScreenWidth,pRenderer->GetIRenderAuxGeom());
                    //NAsyncCull::Debug::Draw2DBox(fX2,fY,3.0f,3.0f,ValueColor2, fScreenHeight,fScreenWidth,pRenderer->GetIRenderAuxGeom());
                    //NAsyncCull::Debug::Draw2DBox(fX3,fY,3.0f,3.0f,ValueColor3, fScreenHeight,fScreenWidth,pRenderer->GetIRenderAuxGeom());
                    uint32 ValueColor0  = (uint32)(pVMemZ[x + 0]);
                    uint32 ValueColor1  = (uint32)(pVMemZ[x + 1]);
                    uint32 ValueColor2  = (uint32)(pVMemZ[x + 2]);
                    uint32 ValueColor3  = (uint32)(pVMemZ[x + 3]);
                    ColorB Color0(ValueColor0, ValueColor0 * 16, ValueColor0 * 256, 222);
                    ColorB Color1(ValueColor1, ValueColor1 * 16, ValueColor1 * 256, 222);
                    ColorB Color2(ValueColor2, ValueColor2 * 16, ValueColor2 * 256, 222);
                    ColorB Color3(ValueColor3, ValueColor3 * 16, ValueColor3 * 256, 222);

                    NAsyncCull::Debug::Draw2DBox(fX0, fY, 3.0f, 3.0f, Color0, fScreenHeight, fScreenWidth, pRenderer->GetIRenderAuxGeom());
                    NAsyncCull::Debug::Draw2DBox(fX1, fY, 3.0f, 3.0f, Color1, fScreenHeight, fScreenWidth, pRenderer->GetIRenderAuxGeom());
                    NAsyncCull::Debug::Draw2DBox(fX2, fY, 3.0f, 3.0f, Color2, fScreenHeight, fScreenWidth, pRenderer->GetIRenderAuxGeom());
                    NAsyncCull::Debug::Draw2DBox(fX3, fY, 3.0f, 3.0f, Color3, fScreenHeight, fScreenWidth, pRenderer->GetIRenderAuxGeom());
                }
            }
#endif
        }

        CULLINLINE  uint32      SizeX() const{return SIZEX; }
        CULLINLINE  uint32      SizeY() const{return SIZEY; }
    };
}

template<uint32 SIZEX, uint32 SIZEY>
_MS_ALIGN(128) float NAsyncCull::CCullRenderer<SIZEX, SIZEY>::m_ZBufferMainMemory[SIZEX * SIZEY] _ALIGN(128);