/*
	relocatable and on demand linkable version of Memcpy__VLM
*/ 

#if defined(PS3)
#if defined(__SPU__)

#ifndef eCryModule
	#define eCryModule eCryM_Launcher
#endif
#include <CryModuleDefs.h>
#include <platform.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"

//non overlapping memcpy for main -> local
//	idea: iterate over main memory and use cache where cpSrc is present, use DMA without cache where not
void* Memcpy__VLM(void* const pDest, const void* const cpSrc, const uint32 cSize)
{
	IF(cSize == 0, 0)
		return pDest;
	assert(*(uint32* __restrict)G_SPU_NUM_SETS>0);
	spu_CheckCacheHazard((uint32)cpSrc >= 256 * 1024);
	spu_CheckCacheHazard((uint32)pDest < 256 * 1024 - cSize);

	bool dmaActive = false;
	const uint32 cFirstLine = (uint32)cpSrc & ~scSPUCacheLineSizeMask;//first line
	const uint32 cStartOff	= (uint32)cpSrc & scSPUCacheLineSizeMask;//offset into 1st line
	const uint32 cEndAddr		= (uint32)cpSrc + cSize;
	const uint32 cLastLine	= cEndAddr & ~scSPUCacheLineSizeMask;//last line
	const uint32 cEndCount	= cEndAddr & scSPUCacheLineSizeMask;//number of bytes to set for last line

	uint8* pCurDest = (uint8*)pDest;

	uint8 tempBufLastLine[scSPUCacheLineSize] _ALIGN(128);
	bool tempBufferLastLineUsed = false;
	uint8* pLastLineDest;

	uint8 tempBuf[scSPUCacheLineSize] _ALIGN(128);
	bool tempBufUsed = false;
	uint32 curTempBufOff  = 0;
	uint8* pCurTempDest   = pCurDest;

	uint32 startFullAddr = cFirstLine;

	const bool cSrcAligned16 = (((uint32)cpSrc & 0xF) == 0) && (((uint32)pDest & 0xF) == 0);

	const vec_uint4 cSplatAmount	= spu_splats((uint32)scSPUCacheLineSize);
	const bool cExecLastLineCode  = 
		cEndCount && ((cFirstLine != cLastLine) || ((cFirstLine == cLastLine) && (cStartOff == 0)));

	const uint32 g_SPUNumSets = *(uint32* __restrict)G_SPU_NUM_SETS;

	//copy til next cache line start
	if(cStartOff != 0)
	{
		//first check if 1st line is present in cache
		const vec_uint4 cAddrSplat = spu_splats(cFirstLine);
		const int cSet = GetCacheSetIndex(cFirstLine);		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, cAddrSplat));
		uint32 bytesToCopy = scSPUCacheLineSize - cStartOff;
		bytesToCopy = (cSize > bytesToCopy)?bytesToCopy : cSize;
		IF(indexInSet < 0, true)
		{
			//not in cache, copy via dma, check if cpSrc has 16 byte alignment, copy to a temp array in this case
			if(cSrcAligned16 && ((bytesToCopy & 0xF) == 0))
			{
				//transfer min of size and remaining cache line bytes
				MemcpyLS(pDest, (uint32)cpSrc, bytesToCopy, g_scMemCpyTag);
				dmaActive = true;
			}
			else
			{
				//no 16 byte alignment, copy into temp buffer
				uint32 transferSize = (scSPUCacheLineSize - (cStartOff & ~15));
				transferSize = (transferSize + 0xF) & ~0xF;//align transfer
				MemcpyLS(tempBuf, (uint32)cpSrc & ~15, transferSize, g_scMemCpyTempTag);
				tempBufUsed = true;
				curTempBufOff  = (uint32)cpSrc & 15;
			}
		}
		else
		{
			//in cache
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			const int cLineStartOff  = (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
			uint8* const pSrcBuf8 = (uint8*)(&((uint8*)g_pSPUCache)[cLineStartOff + cStartOff]);
			if(cSrcAligned16 && (bytesToCopy & 15) == 0)
			{
				//aligned, copy vector wise
				vec_uint4* __restrict pDestBuf	= (vec_uint4*)pDest;
				vec_uint4* __restrict pSrcBuf		= (vec_uint4*)pSrcBuf8;
				for(unsigned int i=0; i<(bytesToCopy >> 4); ++i)
					*pDestBuf++ = *pSrcBuf++;
			}
			else
			{
				//not aligned, copy byte wise
				uint8* __restrict pDestBuf = (uint8*)pDest;
				uint8* __restrict pSrcBuf	 = pSrcBuf8;
				for(unsigned int i=0; i<bytesToCopy; ++i)
					*pDestBuf++ = *pSrcBuf++;
			}
		}
		startFullAddr += scSPUCacheLineSize;
		pCurDest += scSPUCacheLineSize - cStartOff;
	}
	//copy the full cache lines, check always if in cache
	uint32 lineCount				= (cEndAddr - startFullAddr) >> scSPUCacheLineSizeShift;
	lineCount								= (cEndAddr > startFullAddr)?lineCount : 0;//if we have already transfered all we had to
	vec_uint4 addrSplat			= spu_splats(startFullAddr);
	//memcpy for lines between first and last
	for(unsigned int i=0; i<lineCount; ++i)
	{
		//check if present in cache
		const int cSet = GetCacheSetIndex(spu_extract(addrSplat, 0));		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, addrSplat));
		if(indexInSet < 0)
		{
			//not present in cache, dma asynchronously
			if(cSrcAligned16)
			{
				MemcpyLS(pCurDest, spu_extract(addrSplat, 0), scSPUCacheLineSize, g_scMemCpyTag);
				dmaActive = true;
			}
			else
			{
				//use a temp buffer again
				if(tempBufUsed)
				{
					uint8* pSrcBuf	= (uint8*)&tempBuf[curTempBufOff];
					SyncMemory(g_scMemCpyTempTag);
					const int cTempBufCount = scSPUCacheLineSize - curTempBufOff;
					for(unsigned int i=0; i<cTempBufCount; ++i)
						*pCurTempDest++ = *pSrcBuf++;
				}
				MemcpyLS(tempBuf, spu_extract(addrSplat, 0), scSPUCacheLineSize, g_scMemCpyTempTag);
				curTempBufOff  = 0;
				pCurTempDest   = pCurDest;
				tempBufUsed    = true;
			}
			pCurDest += scSPUCacheLineSize;
		}
		else
		{
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			const int cLineStartOff  = (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
			uint8* pSrcBuf8 = (uint8*)(&((uint8*)g_pSPUCache)[cLineStartOff]);
			if(cSrcAligned16)
			{
				vec_uint4 *const pSrcVec = (vec_uint4*)pSrcBuf8;
				NSPU::NCache::CopyCacheLine((vec_uint4*)pCurDest, pSrcVec);
				pCurDest += scSPUCacheLineSize;
			}
			else
			{
				for(unsigned int j=0; j<scSPUCacheLineSize; ++j)
					*pCurDest++ = *pSrcBuf8++;
			}
		}
		addrSplat = addrSplat + cSplatAmount;//increment line address
	}
	//copy last line
	IF(cExecLastLineCode, false)
	{
		//check if present in cache
		const int cSet = GetCacheSetIndex(cLastLine);		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, addrSplat));
		IF(indexInSet < 0, true)
		{
			//not present in cache, dma asynchronously
			if((cEndCount & 0xF) == 0 && cSrcAligned16)
			{
				MemcpyLS(pCurDest, cLastLine, cEndCount, g_scMemCpyTag);
				dmaActive = true;
			}
			else
			{
				//copy into a special temp buffer
				MemcpyLS(tempBufLastLine, cLastLine, (cEndCount + 15) & ~15, g_scMemCpyTempTag1);
				pLastLineDest = pCurDest;
				tempBufferLastLineUsed = true;
			}
		}
		else
		{
			//in cache
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			const int cLineStartOff  = (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
			vec_uint4 *const pSrcVec = (vec_uint4*)(void*)(&((uint8*)g_pSPUCache)[cLineStartOff]);
			if(cSrcAligned16)
			{
				//aligned, copy vector wise
				vec_uint4* __restrict pDestBuf = (vec_uint4*)pCurDest;
				vec_uint4* __restrict pSrcBuf	= (vec_uint4*)pSrcVec;
				for(unsigned int i=0; i<(cEndCount >> 4); ++i)
					*pDestBuf++ = *pSrcBuf++;
				//copy remaining stuff byte wise
				pCurDest = (uint8*)pDestBuf;
				uint8* __restrict pCurSrcBuf = (uint8*)pSrcBuf;
				for(unsigned int i=0; i<(cEndCount & 0xF); ++i)
					*pCurDest++ = *pCurSrcBuf++;
			}
			else
			{
				//not aligned, copy byte wise
				uint8* __restrict pSrcBuf	= (uint8*)pSrcVec;
				for(unsigned int i=0; i<cEndCount; ++i)
					*pCurDest++ = *pSrcBuf++;
			}
		}
	}

	IF(tempBufUsed, false)
	{
		uint8* pSrcBuf	= (uint8*)&tempBuf[curTempBufOff];
		SyncMemory(g_scMemCpyTempTag);
		unsigned int tempBufCount = scSPUCacheLineSize - curTempBufOff;
		tempBufCount = (cSize > tempBufCount)?tempBufCount : cSize;
		for(unsigned int i=0; i<tempBufCount; ++i)
			*pCurTempDest++ = *pSrcBuf++;
	}

	//synchronize transfer
	if(dmaActive)
		SyncMemory(g_scMemCpyTag);

	IF(tempBufferLastLineUsed, false)
	{
		uint8* pCurSrc = tempBufLastLine;
		SyncMemory(g_scMemCpyTempTag1);
		for(unsigned int i=0; i<cEndCount; ++i)
			*pLastLineDest++ = *pCurSrc++;
	}
	return pDest;
}

#endif //__SPU__
#endif //PS3
