/*
	relocatable and on demand linkable version of Memcpy__VML
*/ 

#if defined(PS3)
#if defined(__SPU__)

#define eCryModule eCryM_Launcher
#include <CryModuleDefs.h>
#include <platform.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"

//non overlapping memcpy for local -> main
//	use cache where pDest is present, use DMA without cache where not
//	dma sync at the end is required
void* Memcpy__VML(void* const pDest, const void* const cpSrc, const uint32 cSize)
{
	spu_CheckCacheHazard((uint32)cpSrc < 256 * 1024 - cSize);
	spu_CheckCacheHazard((uint32)pDest >= 256 * 1024);
	spu_CheckCacheHazard(((uint32)cpSrc & 0xF) == 0);//assume 16 byte alignment for speed

	bool dmaActive = false;

	const uint32 cFirstLine = (uint32)pDest & ~scSPUCacheLineSizeMask;//first line
	const uint32 cStartOff	= (uint32)pDest & scSPUCacheLineSizeMask;//offset into 1st line
	const uint32 cEndAddr		= (uint32)pDest + cSize;
	const uint32 cLastLine	= cEndAddr & ~scSPUCacheLineSizeMask;//last line
	const uint32 cEndCount	= cEndAddr & scSPUCacheLineSizeMask;//number of bytes to set for last line

	uint8 tempBuf[scSPUCacheLineSize] _ALIGN(128);
	bool tempBufUsed = false;

	uint32 startFullAddr		= cFirstLine;

	uint8* pCurSrc				  = (uint8*)cpSrc;
	const bool cDestAligned16 = (((uint32)cpSrc & 0xF) == 0) && (((uint32)pDest & 0xF) == 0);

	const vec_uint4 cSplatAmount	= spu_splats((uint32)scSPUCacheLineSize);

	const bool cExecLastLineCode  = 
		cEndCount && ((cFirstLine != cLastLine) || ((cFirstLine == cLastLine) && (cStartOff == 0)));

	const uint32 g_SPUNumSets = *(uint32* __restrict)G_SPU_NUM_SETS;

#if defined(DO_SPU_PROFILING)
	g_ProfID = (vec_uint4)PROF_ID_MEMCPY_VML;
#endif

	//copy til next cache line start
	if(cStartOff != 0)
	{
		//first check if 1st line is present in cache
		const vec_uint4 cAddrSplat = spu_splats(cFirstLine);
		const int cSet = GetCacheSetIndex(cFirstLine);		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, cAddrSplat));
		if(indexInSet < 0)
		{
			//not in cache, copy via dma if cpSrc has 16 byte alignment, cache it otherwise
			if(cDestAligned16)
			{
				NSPU::MemcpyMain((uint32)pDest, pCurSrc, scSPUCacheLineSize - cStartOff, g_MemCpyTag);
				dmaActive = true;
				pCurSrc += scSPUCacheLineSize - cStartOff;
			}
			else
			{
				uint8 *pFirstLineSPU = (uint8*)__spu_cache_lookup((uint32)pDest, 1, 128);
				for(int j=0; j<scSPUCacheLineSize - cStartOff; ++j)
					*pFirstLineSPU++ = *pCurSrc++;
			}
		}
		else
		{
			//in cache
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			const int cLineStartOff   = (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
			uint8* pDestBuf8					= (uint8*)(&((uint8*)g_pSPUCache)[cLineStartOff + cStartOff]);
			const uint32 cBytesToCopy = scSPUCacheLineSize - cStartOff;
			if(cDestAligned16)
			{
				//aligned, copy vector wise
				vec_uint4* __restrict pDestBuf16				= (vec_uint4*)pDestBuf8;
				const vec_uint4* __restrict pSrcBuf16		= (vec_uint4*)pCurSrc;
				for(int i=0; i<(cBytesToCopy >> 4); ++i)
					*pDestBuf16++ = *pSrcBuf16++;
				pCurSrc = (uint8*)pSrcBuf16;
			}
			else
			{
				//not aligned, copy byte wise
				for(int i=0; i<cBytesToCopy; ++i)
					*pDestBuf8++ = *pCurSrc++;
			}
		}
		startFullAddr += scSPUCacheLineSize;
	}
	//copy the full cache lines, check always if in cache
	const uint32 cLineCount	= (cEndAddr - startFullAddr) >> scSPUCacheLineSizeShift;

	vec_uint4 addrSplat						= spu_splats(startFullAddr);
	//memcpy for lines between first and last
	for(int i=0; i<cLineCount; ++i)
	{
		//check if present in cache
		const int cSet = GetCacheSetIndex(spu_extract(addrSplat, 0));		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, addrSplat));
		if(indexInSet < 0)
		{
			//not present in cache, dma asynchronously
			if(cDestAligned16)
			{
				NSPU::MemcpyMain(spu_extract(addrSplat, 0), pCurSrc, scSPUCacheLineSize, g_MemCpyTag);
				dmaActive = true;
			}
			else
			{
				//use a temp to copy the local contents so that it is aligned properly
				if(tempBufUsed)
					NSPU::SyncMemory(g_MemCpyTempTag);
				uint8* pTempCurSrc = pCurSrc;
				for(int j=0; j<scSPUCacheLineSize; ++j)
					tempBuf[j] = *pTempCurSrc++;//copy local contents so that it becomes aligned
				NSPU::MemcpyMain(spu_extract(addrSplat, 0), tempBuf, scSPUCacheLineSize, g_MemCpyTempTag);
				tempBufUsed    = true;
			}
			pCurSrc += scSPUCacheLineSize;
		}
		else
		{
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			const int cLineStartOff  = (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
			uint8* pDestBuf8 = (uint8*)(&((uint8*)g_pSPUCache)[cLineStartOff]);
			if(cDestAligned16)
			{
				vec_uint4 *const pDestBuf16 = (vec_uint4*)pDestBuf8;
				NSPU::NCache::CopyCacheLine(pDestBuf16, (vec_uint4*)pCurSrc);
				pCurSrc += scSPUCacheLineSize;
			}
			else
			{
				for(int j=0; j<scSPUCacheLineSize; ++j)
					*pDestBuf8++ = *pCurSrc++;
			}
		}
		addrSplat = addrSplat + cSplatAmount;//increment line address
	}

	//copy last line
	if(cExecLastLineCode)
	{
		//check if present in cache
		const int cSet = GetCacheSetIndex(cLastLine);		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, addrSplat));
		if(indexInSet < 0)
		{
			//not present in cache, dma asynchronously
			if((cEndCount & 0xF) == 0 && cDestAligned16)
			{
				NSPU::MemcpyMain(cLastLine, pCurSrc, cEndCount, g_MemCpyTag);
				dmaActive = true;
			}
			else
			{
				uint8 *pLastLineSPU = (uint8*)__spu_cache_lookup(cLastLine, 1, 128);
				for(int j=0; j<scSPUCacheLineSize - cStartOff; ++j)
					*pLastLineSPU++ = *pCurSrc++;
			}
		}
		else
		{
			//in cache
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			const int cLineStartOff	= (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
			uint8 *pDestBuf8				= (uint8*)(&((uint8*)g_pSPUCache)[cLineStartOff]);
			if(cDestAligned16)
			{
				//aligned, copy vector wise
				vec_uint4* __restrict pDestBuf16 = (vec_uint4*)pDestBuf8;
				vec_uint4* __restrict pSrcBuf16	 = (vec_uint4*)pCurSrc;
				for(int i=0; i<(cEndCount >> 4); ++i)
					*pDestBuf16++ = *pSrcBuf16++;
				//copy remaining stuff byte wise
				pDestBuf8 = (uint8*)pDestBuf16;
				pCurSrc		= (uint8*)pSrcBuf16;
				for(int i=0; i<cEndCount & 0xF; ++i)
					*pDestBuf8++ = *pCurSrc++;
			}
			else
			{
				//not aligned, copy byte wise
				for(int i=0; i<cEndCount; ++i)
					*pDestBuf8++ = *pCurSrc++;
			}
		}
	}

	IF(tempBufUsed, false)
		NSPU::SyncMemory(g_MemCpyTempTag);

	//synchronize transfer
	if(dmaActive)
		NSPU::SyncMemory(g_MemCpyTag);

	return pDest;
}

#endif //__SPU__
#endif //PS3
