/*
	relocatable and on demand linkable implementation for volatile cache semantics
*/ 

#if defined(PS3)
#if defined(__SPU__)

#if !defined(eCryModule)
	#define eCryModule eCryM_Launcher
#endif
#include <CryModuleDefs.h>
#include <platform.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"

//Maps the local store address of a cache line (expected to be 128 byte aligned, as the parameter name says)
//to the corresponding line inside the shadow buffer:
//	shadow base (the uint32 value stored at G_SPU_CACHE_SHADOW_CACHE) + byte offset of the line from g_pSPUCache
//The argument and the whole expansion are parenthesized so that compound argument expressions and
//surrounding operators cannot change the evaluation order.
#define GET_SHADOW_BUFFER_LINE(ALIGNED_SPU_CACHE_ADDR)\
	((vec_uint4*)(void*)(*(uint32*)G_SPU_CACHE_SHADOW_CACHE + ((ALIGNED_SPU_CACHE_ADDR) - (uint32)g_pSPUCache)))

//if defined, ReloadCacheLine fetches the PPU line with mfc_getllar_again/mfc_read_atomic_status,
//otherwise it uses MemcpyLS/SyncMemory on g_scMemCpyTempTag
#define USE_ATOMICS_FOR_RELOAD
//TODO: lookup always without prefetch

/*
	Reloads one 128 byte cache line from its PPU location and merges it with the local contents.
		cSPUAddr: local store address inside the cache line, rounded down to 128 bytes in here
		cPPUAddr: PPU address inside the source line, rounded down to 128 bytes in here
	Every bit in which the cache line differs from its shadow buffer line counts as locally written
	and is kept, all remaining bits are taken from the freshly fetched PPU line.
	The shadow line is rewritten in such a way that cache XOR shadow yields the same mask afterwards.
	The 8 quadwords of a line are processed fully unrolled.
*/
void ReloadCacheLine(const uint32 cSPUAddr, const uint32 cPPUAddr)
{
	assert(*(uint32* __restrict)G_SPU_NUM_SETS>0);
#if defined(SUPP_DABR)
	//calls snPause() if the word at pDABR->lsAddr no longer matches the value recorded in pDABR->oldVal
	SDABR *const pDABR = (SDABR*)(void*)G_SPU_DABR_ADDR;
	const uint32 cCurDABRVal = *((uint32*)(void*)pDABR->lsAddr);
	if(pDABR->oldVal != cCurDABRVal)
	{
		snPause();
		pDABR->oldVal = cCurDABRVal;//record the new value so that execution can continue
	}
#endif
	//reload existing cache line, merge with existing contents (with the written bits)
	//128 byte aligned local buffer receiving the PPU cache line
	vec_uint4 ppuCont[8] _ALIGN(128);
	SPUSyncAtomicDCache();
#if defined(USE_ATOMICS_FOR_RELOAD)
	//start fetching the aligned PPU line into ppuCont, the transfer is synced further below
	mfc_prep((void*)ppuCont, cPPUAddr & ~127);
	mfc_getllar_again();
#else
	//start fetching the aligned PPU line into ppuCont on tag g_scMemCpyTempTag, synced further below
	MemcpyLS((void*)ppuCont, cPPUAddr & ~127, 128, g_scMemCpyTempTag);
#endif
#if defined(DO_SPU_PROFILING)
	//increment miss and prefetch miss count for current profiling id
	//(uint32 counters located MAX_PROF_ID and 2*MAX_PROF_ID entries behind the slot of the current id)
	*((uint32*)((uint8*)G_SPU_CACHE_PROF_ID_COUNTER_ADDR + (spu_extract(g_ProfID, 0) << 2) + (MAX_PROF_ID << 2))) += 1;
	*((uint32*)((uint8*)G_SPU_CACHE_PROF_ID_COUNTER_ADDR + (spu_extract(g_ProfID, 0) << 2) + (MAX_PROF_ID << 3))) += 1;
#endif
	//the loads and xor operations below only touch local data and are placed between
	//starting and syncing the transfer (presumably to overlap with its latency)
	//get address of cache contents (start of the 128 byte line)
	const uint32 cSPUCacheAddr = cSPUAddr & ~127;
	//get pointer to cache back buffer contents
	vec_uint4* const __restrict pShadowLine = GET_SHADOW_BUFFER_LINE(cSPUCacheAddr);
	//generate difference vector to only update non written bits
	//load shadow line
	const vec_uint4 cShadowLine0 = pShadowLine[0];
	const vec_uint4 cShadowLine1 = pShadowLine[1];
	const vec_uint4 cShadowLine2 = pShadowLine[2];
	const vec_uint4 cShadowLine3 = pShadowLine[3];
	const vec_uint4 cShadowLine4 = pShadowLine[4];
	const vec_uint4 cShadowLine5 = pShadowLine[5];
	const vec_uint4 cShadowLine6 = pShadowLine[6];
	const vec_uint4 cShadowLine7 = pShadowLine[7];
	//load current cache line
	vec_uint4* const __restrict pCacheLine		 = (vec_uint4*)(void*)cSPUCacheAddr;
	const vec_uint4 cCurLine0 = pCacheLine[0];
	const vec_uint4 cCurLine1 = pCacheLine[1];
	const vec_uint4 cCurLine2 = pCacheLine[2];
	const vec_uint4 cCurLine3 = pCacheLine[3];
	const vec_uint4 cCurLine4 = pCacheLine[4];
	const vec_uint4 cCurLine5 = pCacheLine[5];
	const vec_uint4 cCurLine6 = pCacheLine[6];
	const vec_uint4 cCurLine7 = pCacheLine[7];
	//mask bit set = cache and shadow differ in this bit = bit has been written locally
	const vec_uint4 cWriteBackMask0 = spu_xor(cCurLine0, cShadowLine0);
	const vec_uint4 cWriteBackMask1 = spu_xor(cCurLine1, cShadowLine1);
	const vec_uint4 cWriteBackMask2 = spu_xor(cCurLine2, cShadowLine2);
	const vec_uint4 cWriteBackMask3 = spu_xor(cCurLine3, cShadowLine3);
	const vec_uint4 cWriteBackMask4 = spu_xor(cCurLine4, cShadowLine4);
	const vec_uint4 cWriteBackMask5 = spu_xor(cCurLine5, cShadowLine5);
	const vec_uint4 cWriteBackMask6 = spu_xor(cCurLine6, cShadowLine6);
	const vec_uint4 cWriteBackMask7 = spu_xor(cCurLine7, cShadowLine7);
	//sync transfer, ppuCont must not be read before this point
#if defined(USE_ATOMICS_FOR_RELOAD )
	mfc_read_atomic_status();
#else
	SyncMemory(g_scMemCpyTempTag);
#endif
	//merge written bits and update cache contents
	const vec_uint4 cNewLine0 = (vec_uint4)ppuCont[0];
	const vec_uint4 cNewLine1 = (vec_uint4)ppuCont[1];
	const vec_uint4 cNewLine2 = (vec_uint4)ppuCont[2];
	const vec_uint4 cNewLine3 = (vec_uint4)ppuCont[3];
	const vec_uint4 cNewLine4 = (vec_uint4)ppuCont[4];
	const vec_uint4 cNewLine5 = (vec_uint4)ppuCont[5];
	const vec_uint4 cNewLine6 = (vec_uint4)ppuCont[6];
	const vec_uint4 cNewLine7 = (vec_uint4)ppuCont[7];
	//spu_sel: bits of the current line where the mask is set, bits of the fetched line elsewhere
	const vec_uint4 cMergedLine0 = spu_sel(cNewLine0, cCurLine0, cWriteBackMask0);
	const vec_uint4 cMergedLine1 = spu_sel(cNewLine1, cCurLine1, cWriteBackMask1);
	const vec_uint4 cMergedLine2 = spu_sel(cNewLine2, cCurLine2, cWriteBackMask2);
	const vec_uint4 cMergedLine3 = spu_sel(cNewLine3, cCurLine3, cWriteBackMask3);
	const vec_uint4 cMergedLine4 = spu_sel(cNewLine4, cCurLine4, cWriteBackMask4);
	const vec_uint4 cMergedLine5 = spu_sel(cNewLine5, cCurLine5, cWriteBackMask5);
	const vec_uint4 cMergedLine6 = spu_sel(cNewLine6, cCurLine6, cWriteBackMask6);
	const vec_uint4 cMergedLine7 = spu_sel(cNewLine7, cCurLine7, cWriteBackMask7);
	//store merged line into the cache
	pCacheLine[0] = cMergedLine0;
	pCacheLine[1] = cMergedLine1;
	pCacheLine[2] = cMergedLine2;
	pCacheLine[3] = cMergedLine3;
	pCacheLine[4] = cMergedLine4;
	pCacheLine[5] = cMergedLine5;
	pCacheLine[6] = cMergedLine6;
	pCacheLine[7] = cMergedLine7;
	//update back buffer, restore contents so that the mask stays the same:
	//masked bits keep the old shadow value (still differing from the cache),
	//unmasked bits receive the merged (= fetched) value (equal to the cache)
	pShadowLine[0] = spu_sel(cMergedLine0, cShadowLine0, cWriteBackMask0);
	pShadowLine[1] = spu_sel(cMergedLine1, cShadowLine1, cWriteBackMask1);
	pShadowLine[2] = spu_sel(cMergedLine2, cShadowLine2, cWriteBackMask2);
	pShadowLine[3] = spu_sel(cMergedLine3, cShadowLine3, cWriteBackMask3);
	pShadowLine[4] = spu_sel(cMergedLine4, cShadowLine4, cWriteBackMask4);
	pShadowLine[5] = spu_sel(cMergedLine5, cShadowLine5, cWriteBackMask5);
	pShadowLine[6] = spu_sel(cMergedLine6, cShadowLine6, cWriteBackMask6);
	pShadowLine[7] = spu_sel(cMergedLine7, cShadowLine7, cWriteBackMask7);
}

void* DoVolatileCacheLookup(const uint32 cEA, const uint32 cLRUIncr, const int32 cPrefOff)
{
	//check if cache line is present
	spu_CheckCacheHazard(cEA > (uint32)256 * 1024);
	//4 way check of aligned address -> move ea into all 4 slots
	const vec_uint4 cEAAligned4	= spu_splats(cEA & ~scSPUCacheLineSizeMask);
	const uint32 g_SPUNumSets		= *(uint32* __restrict)G_SPU_NUM_SETS;
	assert(g_SPUNumSets > 0);
	const int cSet		= GetCacheSetIndex(cEA);					//get set index
	int indexInSet		= GetCacheIndexNum(SetCache4WayLookup(cSet, cEAAligned4));//lookup of all 4 entries if there is a hit
	IF(indexInSet < 0, true)
	{
		//not present, call cache lookup miss version, cache line is reloaded so no additional work required
		return __spu_cache_lookup_miss(cEA, cLRUIncr, 0);
	}
	else
	{
		indexInSet = scSPUCacheSetNumWaysMask - indexInSet;
		//present, reload cache line and update LRU
#if defined(DO_SPU_PROFILING)
		((uint32*)(void*)G_SPU_CACHE_PROF_ID_COUNTER_ADDR)[spu_extract(g_ProfID,0)] += 1;
		++((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->cacheHits;
#endif
		//generate existing SPU cache address
		const uint32 cLineStartOff	= (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
		const uint32 cSPUAddr				= (uint32)(&((uint8*)g_pSPUCache)[cLineStartOff + (cEA & scSPUCacheLineSizeMask)]);
		ReloadCacheLine(cSPUAddr, cEA);
/*		//update LRU value if not already matching current one
		if(spu_extract(g_pSPUCacheLRUCtrl[cSet], indexInSet) != spu_extract(g_LRUCounter, 0))
		{
			UpdateLRU(g_pSPUCacheLRUCtrl[cSet], indexInSet, spu_add(g_LRUCounter, cLRUIncr));
			g_LRUCounter = spu_add(g_LRUCounter, 1);//increment LRU counter
		}
*/
		return (void*)cSPUAddr;
	}
}

void WriteBackCacheLine(const uint32 cSPUAddr, const uint32 cPPUAddr)
{
	assert(*(uint32* __restrict)G_SPU_NUM_SETS>0);
#if defined(SUPP_DABR)
	SDABR *const pDABR = (SDABR*)(void*)G_SPU_DABR_ADDR;
	if(pDABR->ppuEA == cPPUAddr & ~127)
		snPause();
#endif
	//merge written bits and write back atomically
	vec_uint4 ppuCont[8] _ALIGN(128);
	SPUSyncAtomicDCache();
	mfc_prep((void*)ppuCont, cPPUAddr & ~127);
	mfc_getllar_again();
#if defined(DO_SPU_PROFILING)
	//increment miss and prefetch miss count for current profiling id
	*((uint32*)((uint8*)G_SPU_CACHE_PROF_ID_COUNTER_ADDR + (spu_extract(g_ProfID, 0) << 2) + (MAX_PROF_ID << 2))) += 1;
	*((uint32*)((uint8*)G_SPU_CACHE_PROF_ID_COUNTER_ADDR + (spu_extract(g_ProfID, 0) << 2) + (MAX_PROF_ID << 3))) += 1;
#endif
	//get address of cache contents
	const uint32 cSPUCacheAddr = cSPUAddr & ~127;
	//get pointer to cache back buffer contents
	vec_uint4* const __restrict pShadowLine = GET_SHADOW_BUFFER_LINE(cSPUCacheAddr);
	//generate difference vector to only update non written bits
	const vec_uint4 cShadowLine0 = pShadowLine[0];
	const vec_uint4 cShadowLine1 = pShadowLine[1];
	const vec_uint4 cShadowLine2 = pShadowLine[2];
	const vec_uint4 cShadowLine3 = pShadowLine[3];
	const vec_uint4 cShadowLine4 = pShadowLine[4];
	const vec_uint4 cShadowLine5 = pShadowLine[5];
	const vec_uint4 cShadowLine6 = pShadowLine[6];
	const vec_uint4 cShadowLine7 = pShadowLine[7];
	vec_uint4* const __restrict pCacheLine	= (vec_uint4*)(void*)cSPUCacheAddr;
	const vec_uint4 cCurLine0 = pCacheLine[0];
	const vec_uint4 cCurLine1 = pCacheLine[1];
	const vec_uint4 cCurLine2 = pCacheLine[2];
	const vec_uint4 cCurLine3 = pCacheLine[3];
	const vec_uint4 cCurLine4 = pCacheLine[4];
	const vec_uint4 cCurLine5 = pCacheLine[5];
	const vec_uint4 cCurLine6 = pCacheLine[6];
	const vec_uint4 cCurLine7 = pCacheLine[7];
	const vec_uint4 cWriteBackMask0 = spu_xor(cCurLine0, cShadowLine0);
	const vec_uint4 cWriteBackMask1 = spu_xor(cCurLine1, cShadowLine1);
	const vec_uint4 cWriteBackMask2 = spu_xor(cCurLine2, cShadowLine2);
	const vec_uint4 cWriteBackMask3 = spu_xor(cCurLine3, cShadowLine3);
	const vec_uint4 cWriteBackMask4 = spu_xor(cCurLine4, cShadowLine4);
	const vec_uint4 cWriteBackMask5 = spu_xor(cCurLine5, cShadowLine5);
	const vec_uint4 cWriteBackMask6 = spu_xor(cCurLine6, cShadowLine6);
	const vec_uint4 cWriteBackMask7 = spu_xor(cCurLine7, cShadowLine7);
	int firstInvocation = 1;
	do
	{
		IF(!firstInvocation, false)
			mfc_getllar_again();//fetch from PPU again
		//sync atomic transfer
		mfc_read_atomic_status();
		//merge written bits and write back merged contents
		const vec_uint4 cNewLine0 = (vec_uint4)ppuCont[0];
		const vec_uint4 cNewLine1 = (vec_uint4)ppuCont[1];
		const vec_uint4 cNewLine2 = (vec_uint4)ppuCont[2];
		const vec_uint4 cNewLine3 = (vec_uint4)ppuCont[3];
		const vec_uint4 cNewLine4 = (vec_uint4)ppuCont[4];
		const vec_uint4 cNewLine5 = (vec_uint4)ppuCont[5];
		const vec_uint4 cNewLine6 = (vec_uint4)ppuCont[6];
		const vec_uint4 cNewLine7 = (vec_uint4)ppuCont[7];
		const vec_uint4 cMergedLine0 = spu_sel(cNewLine0, cCurLine0, cWriteBackMask0);
		const vec_uint4 cMergedLine1 = spu_sel(cNewLine1, cCurLine1, cWriteBackMask1);
		const vec_uint4 cMergedLine2 = spu_sel(cNewLine2, cCurLine2, cWriteBackMask2);
		const vec_uint4 cMergedLine3 = spu_sel(cNewLine3, cCurLine3, cWriteBackMask3);
		const vec_uint4 cMergedLine4 = spu_sel(cNewLine4, cCurLine4, cWriteBackMask4);
		const vec_uint4 cMergedLine5 = spu_sel(cNewLine5, cCurLine5, cWriteBackMask5);
		const vec_uint4 cMergedLine6 = spu_sel(cNewLine6, cCurLine6, cWriteBackMask6);
		const vec_uint4 cMergedLine7 = spu_sel(cNewLine7, cCurLine7, cWriteBackMask7);
		ppuCont[0] = cMergedLine0;
		ppuCont[1] = cMergedLine1;
		ppuCont[2] = cMergedLine2;
		ppuCont[3] = cMergedLine3;
		ppuCont[4] = cMergedLine4;
		ppuCont[5] = cMergedLine5;
		ppuCont[6] = cMergedLine6;
		ppuCont[7] = cMergedLine7;
		mfc_putllc_again();
		//update cache and shadow buffer to main cache contents
		pCacheLine[0] = cMergedLine0;
		pCacheLine[1] = cMergedLine1;
		pCacheLine[2] = cMergedLine2;
		pCacheLine[3] = cMergedLine3;
		pCacheLine[4] = cMergedLine4;
		pCacheLine[5] = cMergedLine5;
		pCacheLine[6] = cMergedLine6;
		pCacheLine[7] = cMergedLine7;
		pShadowLine[0] = cMergedLine0;
		pShadowLine[1] = cMergedLine1;
		pShadowLine[2] = cMergedLine2;
		pShadowLine[3] = cMergedLine3;
		pShadowLine[4] = cMergedLine4;
		pShadowLine[5] = cMergedLine5;
		pShadowLine[6] = cMergedLine6;
		pShadowLine[7] = cMergedLine7;
		firstInvocation = 0;
	}
	while(__builtin_expect(mfc_read_atomic_status() != 0, false));
};

#endif //__SPU__
#endif //PS3
