/*
	Relocatable, on-demand linkable version of Memset__VM
*/

#if defined(PS3)
#if defined(__SPU__)

#if !defined(eCryModule)
	#define eCryModule eCryM_Launcher
#endif
#include <CryModuleDefs.h>
#include <platform.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"

//memset for main memory
//	lines between the first and the last touched cache line are set as whole lines:
//		- if the line is present in the cache, the cached copy is overwritten
//		- if not, the line is written by DMA without going through the cache
//	a partially covered first line and the remainder in the last line are set through the cache
//	if any DMA was issued, the transfer is synchronized before returning
//	pDest:	destination address
//	cVal:		fill value, only the low 8 bit are used (cast to uint8)
//	cSize:	number of bytes to set, 0 returns immediately
//	returns pDest
void* Memset__VM(void* pDest, int cVal, const uint32 cSize)
{
	IF(cSize == 0, 0)
		return pDest;
	assert(*(uint32* __restrict)G_SPU_NUM_SETS>0);
	uint32 size = cSize;
	const uint8 cValChar = (uint8)cVal;
	//16 byte vector only containing cVal
	const vec_uchar16 cVal16 = spu_splats(cValChar);
	const uint32 cFirstLine = (uint32)pDest & ~scSPUCacheLineSizeMask;//first line
	const uint32 cStartOff	= (uint32)pDest & scSPUCacheLineSizeMask;//offset into 1st line
	const uint32 cEndAddr		= (uint32)pDest + size;//exclusive end address
	const uint32 cLastLine	= cEndAddr & ~scSPUCacheLineSizeMask;//line containing the end address
	const uint32 cEndCount	= cEndAddr & scSPUCacheLineSizeMask;//number of bytes to set for last line
#if defined(DO_SPU_PROFILING)
	g_ProfID = (vec_uint4)PROF_ID_MEMSET_VM;
#endif
	//treat case where we operate fully within one Cache line special
	IF(cFirstLine == cLastLine, false)
	{
		uint8 *pFirstLineSPU = (uint8*)__spu_cache_lookup((uint32)pDest, 1, 128);
		//set byte wise until the local pointer is 16 byte aligned
		while(size && ((uint32)pFirstLineSPU & 0xF))
		{
			*pFirstLineSPU++ = cValChar;
			--size;
		}
		const uint8 *cpEndVec = (uint8*)((uint32)pFirstLineSPU + (size & ~15));
		//set vector wise
		while(pFirstLineSPU != cpEndVec)
		{
			*(vec_uchar16*)pFirstLineSPU = cVal16;
			pFirstLineSPU += 16;
			size -= 16;
		}
		//set remaining bytes (less than 16) byte wise again
		while(size)
		{
			*pFirstLineSPU++ = cValChar;
			--size;
		}
		return pDest;
	}

	//generate single cache line only containing cVal, source for full line DMA and cache line copies
	vec_uchar16 cVal128[8] _ALIGN(128);
	cVal128[0] = cVal16;		cVal128[1] = cVal16;		cVal128[2] = cVal16;		cVal128[3] = cVal16;
	cVal128[4] = cVal16;		cVal128[5] = cVal16;		cVal128[6] = cVal16;		cVal128[7] = cVal16;

	const bool cFirstLineAligned = (cStartOff == 0);
	//cLastLine is the line containing the exclusive end address and receives cEndCount bytes
	//	cEndCount is a masked line offset and therefore always smaller than a full line,
	//	so the last line is never set as a whole line and cEndCount == 0 means it is not touched at all
	const bool cLastLineTouched = (cEndCount != 0);

	int lineCount = (int)((cLastLine - cFirstLine) >> scSPUCacheLineSizeShift)+1;
	const vec_uint4 cSplatAmount = spu_splats((uint32)scSPUCacheLineSize);
	vec_uint4 addrSplat = spu_splats(cFirstLine);

	bool dmaActive = false;

	//exclude a partially covered first line from the full line loop and start at the next line
	lineCount -= (cFirstLineAligned?0:1);
	addrSplat += (cFirstLineAligned?(vec_uint4){0} : cSplatAmount);//increment line address
	//the last line is never a full line (see above), always exclude it from the full line loop
	lineCount -= 1;

	const uint32 g_SPUNumSets = *(uint32* __restrict)G_SPU_NUM_SETS;

	//memset for the full lines between first and last
	for(int i=0; i<lineCount; ++i)
	{
		//check if present in cache
		const int cSet = GetCacheSetIndex(spu_extract(addrSplat, 0));		//get set index
		spu_CheckCacheHazard(cSet < g_SPUNumSets);
		int indexInSet = GetCacheIndexNum(SetCache4WayLookup(cSet, addrSplat));
		if(indexInSet < 0)
		{
			//not present in cache, dma asynchronously
			MemcpyMain(spu_extract(addrSplat, 0), (NSPU::TAddrLS)(&cVal128), scSPUCacheLineSize, g_scMemCpyTag);
			dmaActive = true;
		}
		else
		{
			//indices are reversed for speed (only asm impl. matters anyway), undo the reversal
			indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
			//offset of the line inside g_pSPUCache
			//	NOTE(review): the shift by (line size shift - 4) suggests 16 byte elements - confirm
			const int cLineStartOff = (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << (scSPUCacheLineSizeShift-4));
			NSPU::NCache::CopyCacheLine(&g_pSPUCache[cLineStartOff], (vec_uint4*)cVal128);
		}
		addrSplat = addrSplat + cSplatAmount;//increment line address
	}

	IF(!cFirstLineAligned, false)
	{
		//lookup first line, set from pDest up to the end of that line
		uint8 *pFirstLineSPU = (uint8*)__spu_cache_lookup((uint32)pDest, 1, 128);
		uint32 toCopy = scSPUCacheLineSize - cStartOff;
		//set byte wise until the local pointer is 16 byte aligned
		while(toCopy && ((uint32)pFirstLineSPU & 0xF))
		{
			*pFirstLineSPU++ = cValChar;
			--toCopy;
		}
		//remainder up to the line end is set vector wise
		const uint32 cVecSetCount = toCopy >> 4;
		vec_uchar16 *const pFirstLineSPUVec16 = (vec_uchar16*)pFirstLineSPU;
		for(uint32 i=0; i<cVecSetCount; ++i)
			pFirstLineSPUVec16[i] = cVal16;
	}

	//skip the lookup entirely if the range ends exactly on a line boundary:
	//	there is nothing to set and the line lies behind the destination range
	IF(cLastLineTouched, false)
	{
		//lookup last line, set the first cEndCount bytes of it
		uint8 *pLastLineSPU = (uint8*)__spu_cache_lookup(cLastLine, 1, 128);
		const uint32 cVecSetCount = cEndCount >> 4;
		vec_uchar16 *const pLastLineSPUVec16 = (vec_uchar16*)pLastLineSPU;
		for(uint32 i=0; i<cVecSetCount; ++i)
			pLastLineSPUVec16[i] = cVal16;
		pLastLineSPU = (uint8*)&pLastLineSPUVec16[cVecSetCount];
		//set remaining bytes (less than 16) byte wise
		uint32 toCopy = cEndCount & 0xF;
		while(toCopy)
		{
			*pLastLineSPU++ = cValChar;
			--toCopy;
		}
	}

	//Invalidate prefetches
	vec_uint4 *const __restrict pPrefetchLRUDir = (vec_uint4*)G_SPU_CACHE_PREF_LRU_DIR_ADDR;
	vec_uint4 *const __restrict pPrefetchDir		= (vec_uint4*)G_SPU_CACHE_PREF_DIR_ADDR;
	*pPrefetchLRUDir = *pPrefetchDir = spu_splats((uint32)0);
	//synchronize transfer, cVal128 lives on the stack and must not go out of scope while DMA is pending
	if(dmaActive)
	{
		__spu_flush_cache_range(pDest, cSize);
		SyncMemory(g_scMemCpyTag);
	}

	return pDest;
}

#endif //__SPU__
#endif //PS3
