/*
	relocatable and on demand linkable version of Memcpy__VMM
*/ 

#if defined(PS3)
#if defined(__SPU__)

#ifndef eCryModule
	#define eCryModule eCryM_Launcher
#endif
#include <CryModuleDefs.h>
#include <platform.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"

//non overlapping memcpy for main -> main (both pointers are main memory addresses)
//	returns pDest, a size of 0 is a no-op
//	fast mode (both pointers share the same offset within 16 bytes):
//		the cache ranges of cpSrc and pDest are flushed, then the 16 byte aligned middle part
//		is DMA'd through 8 local buffers of 256 bytes each (2 KB local store),
//		the unaligned head and tail bytes are written byte wise through the cache
//	slow mode (different offsets within 16 bytes, or copy too small for fast mode):
//		everything is copied byte wise through the cache
//	overlapping ranges are not supported, they only trigger snPause()
void* Memcpy__VMM(void* const pDest, const void* const cpSrc, const uint32 cSize)
{
	IF(cSize == 0, 0)
		return pDest;
	assert(*(uint32* __restrict)G_SPU_NUM_SETS>0);
	//overlapping test
	IF(((uint32)pDest <= (uint32)cpSrc && ((uint32)cpSrc-(uint32)pDest) < cSize ) ||
		 ((uint32)cpSrc <= (uint32)pDest && ((uint32)pDest-(uint32)cpSrc) < cSize ), false)
	{	snPause(); }

	//number of bytes in front of the first 16 byte aligned source address (0..15)
	const uint32 cPreBytes = (0x10 - ((uint32)cpSrc & 0xF)) & 0xF;

	//fast mode can be issued if both pointers have the same offset within 16 bytes
	//	it also requires the copy to reach the first 16 byte boundary: otherwise the unsigned
	//	expression (cSize - cPreBytes) below would wrap around to a huge transfer size and more
	//	than cSize head bytes would be written, such small copies use the byte wise path instead
	const bool cFastMode = (((uint32)cpSrc & 0xF) == ((uint32)pDest & 0xF)) && (cSize >= cPreBytes);

#if defined(DO_SPU_PROFILING)
	g_ProfID = (vec_uint4)PROF_ID_MEMCPY_VMM;
#endif

	//fast mode does bypass the cache entirely (flushes and invalidates beforehand)
	if(cFastMode)
	{
		//uses a little more than 2 KB, best trade off between speed and stack
		const uint32 cLocBufCount				= 4;		//local buffers per half (two halves are used)
		const uint32 cLocBufCountShift	= 2;		//corresponding left shift value
		const uint32 cLocBufSize				= 256;	//size of each buffer
		const uint32 cLocBufSizeShift		= 8;		//corresponding left shift value

		uint8 tempBuf[cLocBufCount * cLocBufSize * 2] _ALIGN(128);
		uint8 tempBuf16[16] _ALIGN(16);			//receives the 16 byte block containing the tail bytes
		uint8 tempBuf16Pre[16] _ALIGN(16);	//receives the 16 byte block containing the head bytes
		uint32 dmaActive[cLocBufCount * 2] = {0};//keeps track which destination DMAs have to be syncd
		//flush cache range of cpSrc and pDest
		FlushCacheRange((uint32)cpSrc, cSize, true);//write back
		FlushCacheRange((uint32)pDest, cSize, true);//write back
		//start at next 16 byte aligned address (identical offset for both pointers in fast mode)
		uint32 curSrcAddr  = ((uint32)cpSrc + 0xF) & ~0xF;
		uint32 curDestAddr = ((uint32)pDest + 0xF) & ~0xF;

		//cSize >= cPreBytes is guaranteed by the fast mode condition, no unsigned wrap around here
		const uint32 cClampedSize		= (cSize - cPreBytes) & ~0xF;//transfer only the 16 byte clamped part
		const uint32 cRemainingSize = (cSize - cPreBytes) & 0xF;//tail bytes behind the clamped part
		const uint32 cBufCount		= (cClampedSize + cLocBufSize-1) >> cLocBufSizeShift;
		//each iteration handles up to 2*cLocBufCount buffers, surplus iterations find no buffers left
		const uint32 cIterCount		= (cBufCount + cLocBufCount-1) >> cLocBufCountShift;
		uint32 remainingBufCount	= cBufCount;
		uint32 remainingSrcSize		= cClampedSize;
		uint32 remainingDestSize	= cClampedSize;

		IF(cPreBytes != 0, false)
		{
			//transfer the 16 byte block containing the preceding bytes to local store
			MemcpyLS(tempBuf16Pre, (uint32)cpSrc & ~0xF, 16, g_scDMAPPUProfTag);
		}

		IF(cRemainingSize != 0, false)
		{
			//transfer the 16 byte block containing the remaining bytes to local store
			MemcpyLS(tempBuf16, ((uint32)cpSrc + cSize) & ~0xF, 16, g_scMemCpyTempTag1);
		}
		for(unsigned int i=0; i<cIterCount; ++i)
		{
			//copy to local memory
			//	cBufLimit0: buffers used in the first half, cBufLimit1: buffers used in the second half
			const uint32 cBufLimit0 = CondSelMin(remainingBufCount, cLocBufCount);
			remainingBufCount -= cBufLimit0;
			const uint32 cBufLimit1 = CondSelMin(remainingBufCount, cBufLimit0);
			for(unsigned int j=0; j<cBufLimit0; ++j)
			{
				const uint32 cCurSize = CondSelMin(remainingSrcSize, cLocBufSize);
				MemcpyLSFenced(&tempBuf[(j) << cLocBufSizeShift], curSrcAddr, cCurSize, g_scMemCpyTag + j);
				curSrcAddr += cCurSize;
				dmaActive[j] = 1;
				remainingSrcSize -= cCurSize;
			}
			for(unsigned int j=0; j<cBufLimit1; ++j)
			{
				const uint32 cCurSize = CondSelMin(remainingSrcSize, cLocBufSize);
				MemcpyLSFenced(&tempBuf[(j+cLocBufCount) << cLocBufSizeShift], curSrcAddr, cCurSize, g_scMemCpyTag + j + cLocBufCount);
				curSrcAddr += cCurSize;
				dmaActive[j + cLocBufCount] = 1;
				remainingSrcSize -= cCurSize;
			}

			//copy back to main memory, each buffer uses the same tag as its incoming transfer
			for(unsigned int j=0; j<cBufLimit0; ++j)
			{
				const uint32 cCurSize = CondSelMin(remainingDestSize, cLocBufSize);
				MemcpyMainFenced(curDestAddr, &tempBuf[(j) << cLocBufSizeShift], cCurSize, g_scMemCpyTag + j);
				curDestAddr += cCurSize;
				remainingDestSize -= cCurSize;
			}
			for(unsigned int j=0; j<cBufLimit1; ++j)
			{
				const uint32 cCurSize = CondSelMin(remainingDestSize, cLocBufSize);
				MemcpyMainFenced(curDestAddr, &tempBuf[(j + cLocBufCount) << cLocBufSizeShift], cCurSize, g_scMemCpyTag + j + cLocBufCount);
				curDestAddr += cCurSize;
				remainingDestSize -= cCurSize;
			}
			remainingBufCount -= cBufLimit1;
		}
		for(unsigned int j=0; j<cLocBufCount*2; ++j)
			if(dmaActive[j])	
				SyncMemory(g_scMemCpyTag + j);//sync large transfers before any cache op
		IF(cRemainingSize, false)
		{
			//copy back to main memory, use cache
			//	the tail starts at the beginning of the last (partial) 16 byte block
			SyncMemory(g_scMemCpyTempTag1);
			uint32 cDestAddr  = ((uint32)pDest + cSize) & ~0xF;
			uint8 *pSPU = (uint8*)__spu_cache_lookup((uint32)cDestAddr, 1, 128);
			for(unsigned int i=0; i<cRemainingSize; ++i)
				pSPU[i] = tempBuf16[i];
		}
		IF(cPreBytes != 0, false)
		{
			//transfer preceding bytes back to main memory, use cache
			//	the head bytes start at the source offset inside the fetched 16 byte block
			const uint8 *pTempBufPtr = &tempBuf16Pre[(uint32)cpSrc & 0xF];
			SyncMemory(g_scDMAPPUProfTag);
			uint8 *pSPU = (uint8*)__spu_cache_lookup((uint32)pDest, 1, 128);
			for(unsigned int i=0; i<cPreBytes; ++i)
				pSPU[i] = pTempBufPtr[i];
		}
	}
	else
	{
		//copy byte wise and use the cache, used if both pointers have different offsets within
		//	16 byte or if the copy is too small for the fast mode
		//for simplicity and code compactness, assume the dest cache line ends before the src one
		//	(not optimal in case the source cache line ends before, will result in one unnecessary lookup and loop start)
		uint32 curSrcPPUEA	= (uint32)cpSrc;
		uint32 curDestPPUEA = (uint32)pDest;
		//bytes left until the end of the current src / dest cache line
		uint32 remSrcCount	= scSPUCacheLineSize - ((uint32)cpSrc & scSPUCacheLineSizeMask);
		uint32 remDestCount = scSPUCacheLineSize - ((uint32)pDest & scSPUCacheLineSizeMask);
		uint8 *pDestSPU			= (uint8*)__spu_cache_lookup((uint32)pDest, 1, 128);
		uint8 *pSrcSPU;
		int remSize = (int)cSize;
		while(remSize)
		{
			//never copy more than what is left of the request
			remSrcCount  = CondSelMin((uint32)remSize, remSrcCount);
			remDestCount = CondSelMin(remDestCount, remSrcCount);//do not let it span across src cache line
			pSrcSPU			 = (uint8*)__spu_cache_lookup(curSrcPPUEA, 1, 128);
			//iterate til the end of the current dest cache line
			for(unsigned int i=0; i<remDestCount; ++i)
				*pDestSPU++ = *pSrcSPU++;
			remSize -= remDestCount;
			curSrcPPUEA  += remDestCount;
			curDestPPUEA += remDestCount;
			remSrcCount -= remDestCount;
			//fetch the dest cache line for the current dest address
			pDestSPU		 = (uint8*)__spu_cache_lookup(curDestPPUEA, 1, 128);
			remDestCount = scSPUCacheLineSize - (curDestPPUEA & scSPUCacheLineSizeMask);
			//iterate til the end of the current src cache line
			for(unsigned int i=0; i<remSrcCount; ++i)
				*pDestSPU++ = *pSrcSPU++;
			remSize -= remSrcCount;
			curSrcPPUEA  += remSrcCount;
			curDestPPUEA += remSrcCount;
			remDestCount -= remSrcCount;
			//src is now at the start of its next cache line
			remSrcCount = scSPUCacheLineSize;
		}
	}
	return pDest;
}

#endif //__SPU__
#endif //PS3
