/*
* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
* its licensors.
*
* For complete copyright and license terms please see the LICENSE at the root of this
* distribution (the "License"). All use of this software is governed by the License,
* or, if provided, by the license below or the license accompanying this file. Do not
* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
*/
// Original file Copyright Crytek GMBH or its affiliates, used under license.

// Description : Misc memory access and memcpy helper functions

#pragma once

#include <platform.h>

// Section dictionary
#if defined(AZ_RESTRICTED_PLATFORM)
#undef AZ_RESTRICTED_SECTION
#define MEMORYACCESS_H_SECTION_TRAITS 1
#define MEMORYACCESS_H_SECTION_CRYPREFETCH 2
#endif

// Traits
#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION MEMORYACCESS_H_SECTION_TRAITS
#if defined(AZ_PLATFORM_XENIA)
#include "Xenia/MemoryAccess_h_xenia.inl"
#elif defined(AZ_PLATFORM_PROVO)
#include "Provo/MemoryAccess_h_provo.inl"
#elif defined(AZ_PLATFORM_SALEM)
#include "Salem/MemoryAccess_h_salem.inl"
#endif
#else
#define MEMORYACCESS_H_TRAIT_USE_LEGACY_PREFETCHLINE 1
#endif

#if MEMORYACCESS_H_TRAIT_USE_LEGACY_PREFETCHLINE
#define PrefetchLine(ptr, off) cryPrefetchT0SSE((void*)((UINT_PTR)ptr + off))
#else
#define PrefetchLine(ptr, off) (void)(0)
#endif
#define ResetLine128(ptr, off) (void)(0)
#define FlushLine128(ptr, off) (void)(0)

//========================================================================================

// cryMemcpy flags
#define MC_CPU_TO_GPU 0x10
#define MC_GPU_TO_CPU 0x20
#define MC_CPU_TO_CPU 0x40

extern int g_CpuFlags;

// CPU feature flags reported in g_CpuFlags
#define CPUF_SSE   0x01
#define CPUF_SSE2  0x02
#define CPUF_3DNOW 0x04
#define CPUF_MMX   0x08
#define CPUF_SSE3  0x10
#define CPUF_F16C  0x20
#define CPUF_SSE41 0x40

#ifdef _CPU_SSE
#ifdef _CPU_X86
#include <xmmintrin.h>
#endif
#define _MM_PREFETCH(MemPtr, Hint) _mm_prefetch((MemPtr), (Hint));
#define _MM_PREFETCH_LOOP(nCount, MemPtr, Hint) { for (int p = 0; p < nCount; p += 64) { _mm_prefetch((const char*)(MemPtr) + p, Hint); } \
}
#else //_CPU_SSE
#define _MM_PREFETCH(MemPtr, Hint)
#define _MM_PREFETCH_LOOP(nCount, MemPtr, Hint)
#endif //_CPU_SSE

void cryMemcpy(void* Dst, const void* Src, int Count);

#if defined(LINUX) || defined(APPLE)
// Define this for Mac and Linux since it is used with the pthread sources
#define mymemcpy16 memcpy
#endif
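// Usage sketch (illustrative only, not part of this header): warming upcoming
// cache lines of a read-heavy buffer before touching it. The element array and
// Accumulate() are hypothetical names; on platforms whose trait disables the
// legacy prefetch, PrefetchLine compiles to a no-op.
//
//     for (int i = 0; i < numElements; ++i)
//     {
//         PrefetchLine(&elements[i], 128); // prime data two cache lines ahead
//         Accumulate(elements[i]);
//     }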
//==========================================================================================

// 3DNow! optimizations

#pragma warning(push)
#pragma warning(disable:4731) // frame pointer register 'ebp' modified by inline assembly code

#if defined _CPU_X86 && !defined(LINUX) && !defined(APPLE)
// ***************************************************************************
inline void cryPrecacheSSE(const void* src, int nbytes)
{
    _asm
    {
        mov esi, src
        mov ecx, nbytes
        // 64 bytes per pass
        shr ecx, 6
        jz endLabel

loopMemToL1:
        prefetchnta 64[ESI] // Prefetch next loop, non-temporal
        prefetchnta 96[ESI]

        movq mm1,  0[ESI]   // Read in source data
        movq mm2,  8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        add esi, 64
        dec ecx
        jnz loopMemToL1

        emms

endLabel:
    }
}
#endif

ILINE void cryPrefetchT0SSE(const void* src)
{
#if defined(WIN32) && !defined(WIN64)
    _asm
    {
        mov esi, src
        prefetchT0 [ESI]    // Prefetch
    }
#else
    _MM_PREFETCH((char*)src, _MM_HINT_T0);
#endif
}

//=================================================================================

// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64           // Upper limit for movsd type copy.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024      // Upper limit for movq/movq copy w/SW prefetch.
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024     // Upper limit for movq/movntq w/SW prefetch.
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE".

#define BLOCK_PREFETCH_COPY infinity // No limit for movq/movntq w/block prefetch.
#define CACHEBLOCK 80h               // Number of 64-byte blocks (cache lines) for block prefetch.
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
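// Taken together, the thresholds above select one of four copy strategies from
// the byte count alone. A minimal sketch of that dispatch (illustrative only;
// it ignores the destination-alignment step the assembly performs first):
//
//     if (count < TINY_BLOCK_COPY)        { /* movsd tail copy                        */ }
//     else if (count < IN_CACHE_COPY)     { /* MMX movq copy with prefetchnta ahead   */ }
//     else if (count < UNCACHED_COPY)     { /* movntq streaming stores, SW prefetch   */ }
//     else                                { /* block prefetch + movntq, CACHEBLOCK (8 KB) chunks */ }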
#if defined _CPU_X86 && !defined(LINUX) && !defined(APPLE)
// Inline assembly syntax for use with Visual C++
inline void cryMemcpy(void* Dst, const void* Src, int Count)
{
    if (g_CpuFlags & CPUF_SSE)
    {
        __asm
        {
            mov ecx, [Count]    ; number of bytes to copy
            mov edi, [Dst]      ; destination
            mov esi, [Src]      ; source
            mov ebx, ecx        ; keep a copy of count

            cld
            cmp ecx, TINY_BLOCK_COPY
            jb $memcpy_ic_3     ; tiny? skip mmx copy

            cmp ecx, 32 * 1024  ; don't align between 32k - 64k because
            jbe $memcpy_do_align; it appears to be slower
            cmp ecx, 64 * 1024
            jbe $memcpy_align_done

$memcpy_do_align:
            mov ecx, 8          ; a trick that's faster than rep movsb...
            sub ecx, edi        ; align destination to qword
            and ecx, 111b       ; get the low bits
            sub ebx, ecx        ; update copy count
            neg ecx             ; set up to jump into the array
            add ecx, offset $memcpy_align_done
            jmp ecx             ; jump to array of movsb's

            align 4
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb

$memcpy_align_done:             ; destination is dword aligned
            mov ecx, ebx        ; number of bytes left to copy
            shr ecx, 6          ; get 64-byte block count
            jz $memcpy_ic_2     ; finish the last few bytes

            cmp ecx, IN_CACHE_COPY / 64 ; too big for cache? use uncached copy
            jae $memcpy_uc_test

            // This is small block copy that uses the MMX registers to copy 8 bytes
            // at a time. It uses the "unrolled loop" optimization, and also uses
            // the software prefetch instruction to get the data into the cache.
            align 16
$memcpy_ic_1:                   ; 64-byte block copies, in-cache copy
            prefetchnta [esi + (200 * 64 / 34 + 192)] ; start reading ahead

            movq mm0, [esi + 0] ; read 64 bits
            movq mm1, [esi + 8]
            movq [edi + 0], mm0 ; write 64 bits
            movq [edi + 8], mm1 ; note: the normal movq writes the
            movq mm2, [esi + 16]; data to cache; a cache line will be
            movq mm3, [esi + 24]; allocated as needed, to store the data
            movq [edi + 16], mm2
            movq [edi + 24], mm3
            movq mm0, [esi + 32]
            movq mm1, [esi + 40]
            movq [edi + 32], mm0
            movq [edi + 40], mm1
            movq mm2, [esi + 48]
            movq mm3, [esi + 56]
            movq [edi + 48], mm2
            movq [edi + 56], mm3

            add esi, 64         ; update source pointer
            add edi, 64         ; update destination pointer
            dec ecx             ; count down
            jnz $memcpy_ic_1    ; last 64-byte block?

$memcpy_ic_2:
            mov ecx, ebx        ; has valid low 6 bits of the byte count
$memcpy_ic_3:
            shr ecx, 2          ; dword count
            and ecx, 1111b      ; only look at the "remainder" bits
            neg ecx             ; set up to jump into the array
            add ecx, offset $memcpy_last_few
            jmp ecx             ; jump to array of movsd's

$memcpy_uc_test:
            cmp ecx, UNCACHED_COPY / 64 ; big enough? use block prefetch copy
            jae $memcpy_bp_1
$memcpy_64_test:
            or ecx, ecx         ; tail end of block prefetch will jump here
            jz $memcpy_ic_2     ; no more 64-byte blocks left

            // For larger blocks, which will spill beyond the cache, it's faster to
            // use the Streaming Store instruction MOVNTQ. This write instruction
            // bypasses the cache and writes straight to main memory. This code also
            // uses the software prefetch instruction to pre-read the data.
            align 16
$memcpy_uc_1:                   ; 64-byte blocks, uncached copy
            prefetchnta [esi + (200 * 64 / 34 + 192)] ; start reading ahead

            movq mm0, [esi + 0] ; read 64 bits
            add edi, 64         ; update destination pointer
            movq mm1, [esi + 8]
            add esi, 64         ; update source pointer
            movq mm2, [esi - 48]
            movntq [edi - 64], mm0 ; write 64 bits, bypassing the cache
            movq mm0, [esi - 40]   ; note: movntq also prevents the CPU
            movntq [edi - 56], mm1 ; from READING the destination address
            movq mm1, [esi - 32]   ; into the cache, only to be over-written
            movntq [edi - 48], mm2 ; so that also helps performance
            movq mm2, [esi - 24]
            movntq [edi - 40], mm0
            movq mm0, [esi - 16]
            movntq [edi - 32], mm1
            movq mm1, [esi - 8]
            movntq [edi - 24], mm2
            movntq [edi - 16], mm0
            dec ecx
            movntq [edi - 8], mm1
            jnz $memcpy_uc_1    ; last 64-byte block?

            jmp $memcpy_ic_2    ; almost done
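            // (The block-prefetch path below pulls CACHEBLOCK = 80h = 128 cache
            //  lines, i.e. an 8 KB chunk, into cache per pass using plain loads,
            //  walking backwards so the hardware prefetcher does not run ahead,
            //  and only then streams the chunk out with movntq.)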
            // For the largest size blocks, a special technique called Block Prefetch
            // can be used to accelerate the read operations. Block Prefetch reads
            // one address per cache line, for a series of cache lines, in a short loop.
            // This is faster than using software prefetch. The technique is great for
            // getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1:                   ; large blocks, block prefetch copy
            cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
            jl $memcpy_64_test  ; no, back to regular uncached copy

            mov eax, CACHEBLOCK / 2  ; block prefetch loop, unrolled 2X
            add esi, CACHEBLOCK * 64 ; move to the top of the block
            align 16
$memcpy_bp_2:
            mov edx, [esi - 64] ; grab one address per cache line
            mov edx, [esi - 128]; grab one address per cache line
            sub esi, 128        ; go reverse order to suppress HW prefetcher
            dec eax             ; count down the cache lines
            jnz $memcpy_bp_2    ; keep grabbing more lines into cache

            mov eax, CACHEBLOCK ; now that it's in cache, do the copy
            align 16
$memcpy_bp_3:
            movq mm0, [esi]     ; read 64 bits
            movq mm1, [esi + 8]
            movq mm2, [esi + 16]
            movq mm3, [esi + 24]
            movq mm4, [esi + 32]
            movq mm5, [esi + 40]
            movq mm6, [esi + 48]
            movq mm7, [esi + 56]
            add esi, 64         ; update source pointer
            movntq [edi], mm0   ; write 64 bits, bypassing cache
            movntq [edi + 8], mm1  ; note: movntq also prevents the CPU
            movntq [edi + 16], mm2 ; from READING the destination address
            movntq [edi + 24], mm3 ; into the cache, only to be over-written,
            movntq [edi + 32], mm4 ; so that also helps performance
            movntq [edi + 40], mm5
            movntq [edi + 48], mm6
            movntq [edi + 56], mm7
            add edi, 64         ; update dest pointer
            dec eax             ; count down
            jnz $memcpy_bp_3    ; keep copying
            sub ecx, CACHEBLOCK ; update the 64-byte block count
            jmp $memcpy_bp_1    ; keep processing chunks

            // The smallest copy uses the X86 "movsd" instruction, in an optimized
            // form which is an "unrolled loop". Then it handles the last few bytes.
            align 4
            movsd
            movsd               ; perform last 1 - 15 dword copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd

            movsd
            movsd               ; perform last 1 - 7 dword copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd

$memcpy_last_few:               ; dword aligned from before movsd's
            mov ecx, ebx        ; has valid low 2 bits of the byte count
            and ecx, 11b        ; the last few cows must come home
            jz $memcpy_final    ; no more, let's leave
            rep movsb           ; the last 1, 2, or 3 bytes

$memcpy_final:
            emms                ; clean up the MMX state
            sfence              ; flush the write buffer
            // mov eax, [dest]  ; ret value = destination pointer
        }
    }
    else
    {
        memcpy(Dst, Src, Count);
    }
}

inline void cryPrefetch(const void* Src, int nCount)
{
    nCount >>= 6;
    if (nCount > 0)
    {
        _asm
        {
            mov esi, Src;
            mov ecx, nCount;
mPr0:
            align 16
            dec ecx;
            mov eax, [esi];
            mov eax, 0;
            lea esi, [esi + 40h];
            jne mPr0;
        }
    }
    else
    {
        _asm
        {
            mov esi, Src;
            mov ecx, nCount;
mPr1:
            align 16
            inc ecx;
            mov eax, [esi];
            mov eax, 0;
            lea esi, [esi - 40h];
            jne mPr1;
        }
    }
}

inline void cryMemcpy(void* inDst, const void* inSrc, int nCount, int nFlags)
{
    cryMemcpy(inDst, inSrc, nCount);
}

//==========================================================================================
// SSE optimizations
#else

const int PREFNTA_BLOCK = 0x4000;

ILINE void cryMemcpy(void* Dst, const void* Src, int n)
{
    char* dst = (char*)Dst;
    const char* src = (const char*)Src;
    while (n > PREFNTA_BLOCK)
    {
        _MM_PREFETCH_LOOP(PREFNTA_BLOCK, src, _MM_HINT_NTA);

        memcpy(dst, src, PREFNTA_BLOCK);
        src += PREFNTA_BLOCK;
        dst += PREFNTA_BLOCK;
        n -= PREFNTA_BLOCK;
    }
    _MM_PREFETCH_LOOP(n, src, _MM_HINT_NTA);
    memcpy(dst, src, n);
}

ILINE void cryMemcpy(void* Dst, const void* Src, int n, int nFlags)
{
    char* dst = (char*)Dst;
    const char* src = (const char*)Src;
    while (n > PREFNTA_BLOCK)
    {
        _MM_PREFETCH_LOOP(PREFNTA_BLOCK, src, _MM_HINT_NTA);

        memcpy(dst, src, PREFNTA_BLOCK);
        src += PREFNTA_BLOCK;
        dst += PREFNTA_BLOCK;
        n -= PREFNTA_BLOCK;
    }
    _MM_PREFETCH_LOOP(n, src, _MM_HINT_NTA);
    memcpy(dst, src, n);
}
#endif

#pragma warning(pop)
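// Usage sketch (illustrative only): the four-argument overload keeps the same
// signature across platforms; the MC_* flags describe the transfer direction,
// and on the code paths in this header they do not change the copy itself.
// The buffer names and size below are hypothetical.
//
//     cryMemcpy(pStagingBuffer, pSourceData, dataSizeInBytes, MC_CPU_TO_GPU);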
#include "Provo/MemoryAccess_h_provo.inl" #elif defined(AZ_PLATFORM_SALEM) #include "Salem/MemoryAccess_h_salem.inl" #endif #endif #if defined(AZ_RESTRICTED_SECTION_IMPLEMENTED) #undef AZ_RESTRICTED_SECTION_IMPLEMENTED #else //implement something usual to bring one memory location into L1 data cache ILINE void CryPrefetch(const void* const cpSrc) { cryPrefetchT0SSE(cpSrc); } #endif #define CryPrefetchInl CryPrefetch