/* 
	implementation of memory manager and cache miss handler 
	nothing is put into function sections so that all is included into the driver
	Cache Notes:
		only single cache lines are supported since the code generator does only support natural loads
		multiple cachelines would collide with LRU and cannot be implemented for all cases
		automatic DMA prefetching is enabled
			prefetched cachelines not currently present and not in async write back progress
		atomic write back is performed in 2 stages to make use of latencies between misses, the 2nd stage is toggled 
			in the asm epilogue
		simple asynchronous write back transfer is performed (faster) for areas specified by the program
			(calls to SPUAddCacheWriteRangeAsync), no DMA sync is performed
		to not let atomic write backs, prefetches or async transfers transfer old memory, the back transfer addresses 
			are always compared against the new cache line address (the last 4 and the present one),
			prefetches are nullified if equal, async transfers are syncd and atomic transfers are just copying the merged present contents

*/ 

#if defined(PS3)

#if defined(__SPU__)

#define eCryModule eCryM_Launcher
#include <CryModuleDefs.h>
#include <platform.h>
#include <cell/dma.h>
#include "SPUMemManager_spu.h"
#include "SPULoaderDefs.h"
#include <IJobManSPU.h>
#include "SPU.h"
#include "SPUUtilities.h"
#include "./Cache/Cache_spu.h"
#include "../PPU/SPUMemAreaMan.h"
#include "CodePage/SPUPages.h"
#include "CodePage/SPUPageLayout.h"
#include "../PPU/PPU.h"
#include <ILog.h>

//global state of the SPU memory manager: code paging bookkeeping, return stack and atomic transfer buffer,
//	followed by declarations of the NSPU::NDriver globals used here (declared extern, defined outside of this section)
namespace NSPU
{
	//ptr to global page directory(all pages with EA / size)(page ID = index starting at 0)
	SPageInfo *g_GlobalSPUPageDir _ALIGN(16);	
	//state of each page slot
	SPageState g_SPUPageStates[scMaxSPUPageCount] _ALIGN(16);
	vec_int4 g_SPUPageDir;							//ID/index for each page slot, duplicated from g_SPUPageStates
	vec_uint4 g_SPUPageLRUDir;					//LRU state for each page slot, reuses cache one
	vec_uint4 g_PageLRUCounter;					//LRU counter for pages
	uint8* __restrict g_SPUPageMem _ALIGN(16);	//ptr to page memory, set up for each job depending on page requirement
	//g_PageMemLower / g_PageMemUpper are also compared element wise against $lr to find the page slot currently executed
	//	(see GetCurPageID, GetCurPageAddr and GenFuncPtrFromId)
	vec_uint4 g_PageMemLower;						//lower address of the page memory slot, used in CodePagingCallMissHandler
	vec_uint4 g_PageMemUpper;						//upper address of the page memory slot, used in CodePagingReturnMissHandler
	uint32 g_SPUPageSize _ALIGN(16);		//max size of each page slot
	SReturnStackEntry g_ReturnStack[RETURN_STACK_MAX_ENTRIES] _ALIGN(16);//return stack
	SReturnStackEntry *__restrict g_pReturnStackTop _ALIGN(16);//ptr to current top entry into g_ReturnStack
	NSPU::SPageDirInfo g_PageInfo _ALIGN(16);//page info (its globalVarBaseAddr member is read by ResolveGlobalVarAddr)
		
	uint8 g_SPUAtomicBuf[128] _ALIGN(128) _DATASEC;//atomic buffer (128 byte, 128 byte aligned)

	//the following symbols are only declared here, their definitions are not part of this file section
	namespace NDriver
	{
#if defined(SUPP_DABR)
		extern SDABR g_sDABR;
#endif
		extern uint8 g_sLSBuffer[]; //its start is used as g_scPreWriteArea by the cache
		extern SInfoBlock g_sInfoBlock;//jobId of it is printed by the polling version of MFC_SYNC
		extern CSPUMemMan g_sMemMan;
		extern uint32 g_DestMemAreaEA;
#if defined(SUPP_PRINTF)
		extern uint32 g_DestPrintfAreaEA;
#endif
#if defined(DO_SPU_PROFILING)
		extern SJobPerfStats g_PerfStats;//profiling counters incremented throughout the cache code
		extern uint32 g_DestProfAreaEA;
#endif
#if !defined(_NO_SPU_ASSERT)
		extern uint32 g_FuncTableEntryCount;//number of function table entries, used to validate IDs in GenFuncPtrFromId
#endif
	}
}

//MFC_SYNC(tagMask, ea): waits until the DMA transfers selected by the tag mask have completed
//	the macro does not write MFC_WrTagMask itself, the caller visible in this file (DoPrefetchLookup) writes it right before
//	MFC_SYNC_BY_POLLING version:
//		requests an immediate tag status update in a loop until the status read from MFC_RdTagStat equals cTagMask
//		once more than 4000000 iterations have passed, the timeout is printed (tag index derived from the mask, job ID,
//		and ea unless ea is 0xFFFFFFFF) and SPU_DEBUG_HALT is invoked
//	default version:
//		requests MFC_TAG_UPDATE_ALL and reads MFC_RdTagStat once, both macro arguments are ignored
//	NOTE: the arguments are not parenthesized and are expanded several times, pass simple side effect free expressions only
//	NOTE(review): WHILE(cond, hint) is defined elsewhere, presumably a while with branch hint - confirm
#if defined(MFC_SYNC_BY_POLLING)
	#define MFC_SYNC(cTagMask, ea)\
	{\
		volatile int counter = 0;\
		do \
		{\
			++counter;\
			if(counter > 4000000)\
			{\
				const uint32 cTagID = 31 - spu_extract(spu_cntlz(spu_promote(cTagMask, 0)), 0);\
				printf("DMA-Timeout in SPUMemManager for tag=%d  job ID=%d\n", cTagID, NSPU::NDriver::g_sInfoBlock.jobId);\
				if(ea != 0xFFFFFFFF)printf("   ea=0x%08x\n", ea);\
				SPU_DEBUG_HALT;\
			}\
			si_wrch(MFC_WrTagUpdate,si_from_uint(MFC_TAG_UPDATE_IMMEDIATE));\
		}WHILE(si_to_uint(si_rdch(MFC_RdTagStat)) != cTagMask, false);\
	}
#else
	#define MFC_SYNC(tagMask, ea) {spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); spu_readch(MFC_RdTagStat);}
#endif

//---------------------------------------------code paging------------------------------------------------
namespace NSPU
{
	namespace NCodePaging
	{
#if defined(CHECK_PAGE_HAZARD)
		//hazard function jumped into from asm - miss handlers if stack pointer corrupts
		void PageHazard(const char* cpHazardCause)
		{
			printf("Encountered page hazard in %s\n",cpHazardCause);
			SPU_DEBUG_HALT;
		}
#endif

		//retrieve current page using $lr
		int GetCurPageID()
		{
			vec_uint4 curLR;
			asm volatile("ori %0, $lr, 0" : "=r"(curLR) : : );//copy link register
			const vec_uint4 cLR4							= spu_splats(spu_extract(curLR, 0));
			const vec_uint4 cLowerAddrCmpRes	= spu_cmpgt(cLR4, g_PageMemLower);
			const vec_uint4 cUpperAddrCmpRes	= spu_cmpgt(g_PageMemUpper, cLR4);
			const vec_uint4 cCurPageMask			= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes);
			return (int)spu_extract(spu_orx(spu_and(cCurPageMask, (vec_uint4)(qword)g_SPUPageDir)), 0);
		}

		//retrieve current page using $lr
		uint32 GetCurPageAddr()
		{
			vec_uint4 curLR;
			asm volatile("ori %0, $lr, 0" : "=r"(curLR) : : );//copy link register
			const vec_uint4 cLR4							= spu_splats(spu_extract(curLR, 0));
			const vec_uint4 cPageMemLower			= g_PageMemLower;
			const vec_uint4 cLowerAddrCmpRes	= spu_cmpgt(cLR4, cPageMemLower);
			const vec_uint4 cUpperAddrCmpRes	= spu_cmpgt(g_PageMemUpper, cLR4);
			const vec_uint4 cCurPageMask			= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes);
			return spu_extract(spu_orx(spu_and(cCurPageMask, (vec_uint4)(qword)cPageMemLower)), 0);
		}

		const int ResolveGlobalVarAddr(const int)
		{
			//g_CrossPageData contains the data, parameter just acts as placeholder for the lqr instruction
			return ((int)g_PageInfo.globalVarBaseAddr + spu_extract((vec_int4)((qword)g_CrossPageData), 0));
		}
	}//NCodePaging
} //SPU
//---------------------------------------------software cache------------------------------------------------

#if !defined(_NO_SPU_ASSERT)
	//hazard function jumped into from asm - miss handlers if stack pointer corrupts
	//reports that less than STACK_WARNING_VAL bytes of stack are left
	//it only prints the message, it does not halt by itself
	void StackAssertFunc()
	{
		printf("Stack check failed (<%d bytes left)\n",STACK_WARNING_VAL);
	}
#endif

	//builds the packed function pointer data for the function table entry cFuncID
	//	the table base address is taken from element 3 of g_SetMaskSL4, each table entry is a 32 bit word:
	//		lower 16 bit: ID of the destination page, upper 16 bit: destination offset, 0xFFFFFFFF marks an invalid entry
	//	returned vector (unsigned short elements):
	//		[1] = ID of the page the caller executes in (slot whose address range encloses $lr, 0 if none)
	//		[3] = destination offset
	//		[4] = destination page ID
	//		all other elements are 0
	//	with asserts enabled an out of range ID or an invalid entry is reported via printf (and stops with SUPP_SN)
	const vec_ushort8 GenFuncPtrFromId(const uint32 cFuncID)
	{
		//capture the link register before anything else: the calls in the error paths below (printf)
		//	use $lr for their own return address and would overwrite the caller's value
		vec_uint4 curLR;
		asm volatile("ori %0, $lr, 0" : "=r"(curLR) : : );//copy link register
		const unsigned int cFuncTableBaseAddr = spu_extract(g_SetMaskSL4, 3);
		uint32 funcEntry = ((unsigned int*)cFuncTableBaseAddr)[cFuncID];
#if !defined(_NO_SPU_ASSERT)
		assert(cFuncTableBaseAddr > 0 && cFuncTableBaseAddr < 256 * 1024);
		if(cFuncID >= NSPU::NDriver::g_FuncTableEntryCount)
		{
			printf("Func.ptr entry ID (%d) invalid, max index: %d\n",cFuncID, NSPU::NDriver::g_FuncTableEntryCount);
	#if defined(SUPP_SN)
			__asm volatile ("stop 255");
	#endif
		}
		if(funcEntry == 0xFFFFFFFF)
		{
			printf("Func.ptr entry invalid for FuncID=%d\n",cFuncID);
	#if defined(SUPP_SN)
			__asm volatile ("stop 255");
	#endif
		}
#endif
		//determine the page slot enclosing the caller's return address and fetch its page ID
		const vec_uint4 cLR4							= spu_splats(spu_extract(curLR, 0));
		const vec_uint4 cLowerAddrCmpRes	= spu_cmpgt(cLR4, NSPU::g_PageMemLower);
		const vec_uint4 cUpperAddrCmpRes	= spu_cmpgt(NSPU::g_PageMemUpper, cLR4);
		const vec_uint4 cCurPageMask				= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes);
		const uint32 cCurID = (uint32)spu_extract(spu_orx(spu_and(cCurPageMask, (vec_uint4)(qword)NSPU::g_SPUPageDir)), 0);
		//start from a defined value, the vector was read uninitialized by the first spu_insert before
		vec_ushort8 vecData = (vec_ushort8){0};
		vecData = spu_insert(cCurID, vecData, 1);
		const uint32 cFuncEntryMasked = (funcEntry & 0x0000FFFF);
		//NOTE(review): the weak marker is set in the lower half of funcEntry after cFuncEntryMasked has been taken
		//	and only the upper half of funcEntry is used below, so the marker never reaches the returned vector
		//	confirm if destPageID was meant to carry it, behavior is left unchanged here
		funcEntry |= (cFuncEntryMasked == cCurID)?0x8000 : 0;//mark as weak
		vecData = spu_insert((unsigned short)cFuncEntryMasked, vecData, 4);//set destPageID
		vecData = spu_insert((unsigned short)((funcEntry & 0xFFFF0000) >> 16), vecData, 3);//set destOffset, keep stored as multiple of 4
		return vecData;
	}

//functions with C linkage, only declared here (definitions are not part of this file section)
//NOTE(review): presumably the asm cache lookup / miss entry points mentioned in the remarks on SPUCacheMissHandler - confirm
extern "C"
{
	extern void* DoLookupCache(const uint32, const uint32, const uint32, const uint32, const uint32, const uint32);
	extern void* DoCacheMiss(const uint32, const uint32, const uint32, const uint32, const uint32, const uint32);
};

#if defined(DO_SPU_PROFILING)
	//forward declaration, the definition is not part of this file section
	void SPUProfDataHandler();
#endif

namespace NSPU
{
#if !defined(_NO_SPU_ASSERT)
	extern uint32 g_sProgramTopLS;//program top address
#endif

	namespace NCache
	{
		//is also shared with Cache line buffer atomic Usage on SPUDriver
		//start of the driver LS buffer, destination of cache lines which were not found in the prefetch buffer (see DoPrefetchLookup)
		#define g_scPreWriteArea				((vec_uint4*)&NSPU::NDriver::g_sLSBuffer[0]) 

		//current number of sets in cache
		uint32 g_SPUNumSets _ALIGN(16);

		//pointer to shadow buffer (saved register since it is only accessed here)
		//holds the copy each cache line is compared against to detect altered bits (see IsCacheLineUnchanged)
		vec_uint4* __restrict g_pSPUShadowCache _ALIGN(16);

		//4 cache line buffer where to transfer asynchronous back from
		vec_uint4 g_SPUAsyncCacheLine[4*scSPUCacheLineSize/sizeof(vec_uint4)] _ALIGN(128) _DATASEC;//force it to go into the data section
		//current tag for async transfers DMA is issued with (4 buffer) (0..3)
		//also temporarily used to indicate if there are any transfers to sync in SyncCache
		//the value is used as index into g_SPUAsyncCacheLine and g_SPUAsyncDir too (see FlushSingleCacheLine)
		uint32 g_CurSPUAsyncTag _ALIGN(16);
		vec_uint4 g_SPUAsyncDir;//current EA of cachelines written back async 

		uint32 g_CurAtomicEA _ALIGN(16);//current EA an atomic transfer is in progress, 0 if none
#if defined(ENABLE_HAZARD_MODE)
		uint32 g_AtomicEAToStart _ALIGN(16);//1 if there is an atomic write back to be started
#endif
#if defined(DO_SPU_PROFILING)
		vec_uint4 g_SPUCacheHitIncr;				//scalar increment for asm-version of DoCacheLookup
		//lower MAX_PROF_ID * 4 byte are to track numbers of cache lookups, next MAX_PROF_ID * 4 byte are to track numbers of cache misses
		//	last MAX_PROF_ID * 4 byte are to track numbers of DMA prefetch misses
		//gets transferred back in FlushCacheComplete, so do not call it several times
		vec_uint4 g_SPUCacheProfIDCounter[MAX_PROF_ID * (4*3) / sizeof(vec_uint4)];//4 byte num of misses/hits/prefetch misses per id
		vec_uint4 g_SPUCacheCurProfID;//current profiling id (required to track misses)
#endif
 
		//prefetch data area
		//one cache line per prefetch slot, the slot index selects line, directory element and DMA tag (GetPrefetchTagID)
		uint8 g_pPrefetchBuffer[scSPUCacheSetNumWays * scSPUCacheLineSize] _ALIGN(128) _DATASEC;
		//prefetch dir and LRU
		vec_uint4 g_PrefetchLRUDir;//prefetch LRU dir 
		vec_uint4 g_PrefetchDir;//prefetch address dir
		//last miss transfered back EA, used to verify prefetch entry in post-asm
		//registers can be overwritten, just used to transfer data from miss handler into post handler asm code
		//one way: miss handler->cache lookup, can be reused somewhere, is used as dummy instruction outside too
#if !defined(SPU_CACHE_MISS_USE_ASM)
		//both register copies are written together with their memory counterparts whenever the prefetch dir is updated
		register vec_uint4 volatile g_PrefetchLRUDirReg __asm__ ("$69");//register copy of g_PrefetchLRUDir for post asm code
		register vec_uint4 volatile g_PrefetchDirReg __asm__ ("$68"); //register copy of g_PrefetchDir for post asm code
	#if !defined(ENABLE_HAZARD_MODE)
		//without hazard mode the start flag of the atomic write back is kept in g_ProfID (defined elsewhere) instead of memory
		#define g_AtomicEAToStartReg g_ProfID	//		register vec_uint4 volatile g_AtomicEAToStartReg __asm__ ("$70");//1 if there is an atomic write back to be started
	#endif 
#endif	//SPU_CACHE_MISS_USE_ASM

		//modified via SPUJob.h and SPUMemManager_spu.h
		//one range per element: an address is inside if g_AsyncRangesDirFrom <= address < g_AsyncRangesDirTo (see CheckWriteBackAsync)
		vec_uint4 g_AsyncRangesDirFrom;//async write back from ranges
		vec_uint4 g_AsyncRangesDirTo;//async write back to ranges

		//add 8 to the LRU to make explicit prefetches last longer than automatic ones
		void DmaPref(const uint32 cEA)
		{
			spu_CheckCacheHazard(cEA > (uint32)256 * 1024);
			const uint32 cAllocatedEA										= cEA & ~scSPUCacheLineSizeMask;
			const vec_uint4 cAllocatedEA4								= spu_splats(cAllocatedEA);
			const vec_uint4 cCurPrefLRUDirCont					= g_PrefetchLRUDir;
			IF(spu_extract(spu_gather(spu_and(g_PrefetchDir, cAllocatedEA4)),0) != 0, 0)//already present
				return;
			const uint32 cLRUReplIndex									= GetReplIndex(cCurPrefLRUDirCont);
			const vec_uint4 cLRUReplMask								= spu_insert(0xFFFFFFFF, (const vec_uint4){0}, cLRUReplIndex);
			si_wrch(MFC_LSA,si_from_uint((uint32)g_pPrefetchBuffer + (cLRUReplIndex << scSPUCacheLineSizeShift)));
			//		si_wrch(MFC_EAH,si_from_uint((uint64)(cEA)>>32));//this should not be necessary at all, but it becomes very slow otherwise
			g_PrefetchLRUDir														= spu_sel(cCurPrefLRUDirCont, spu_add(g_LRUCounter, 8), cLRUReplMask);
			si_wrch(MFC_EAL,si_from_uint(cAllocatedEA));
			g_PrefetchDir																= spu_sel(g_PrefetchDir, cAllocatedEA4, cLRUReplMask);
			si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));
			si_wrch(MFC_TagID,si_from_uint(GetPrefetchTagID(cLRUReplIndex)));
			si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD));//toggle prefetch
		}

		//tests if an address lies within one of the async write back ranges
		//	cEASplat: the address replicated into all elements, each element is tested against one range
		//	returns true if g_AsyncRangesDirFrom <= address < g_AsyncRangesDirTo holds for at least one range
		//keep in sync with FlushCacheRange - impl.
		SPU_DRIVER_INLINE
		const bool CheckWriteBackAsync(const vec_uint4 cEASplat)
		{
			//all bits set for each range ending above the address (address < to)
			const vec_uint4 cBelowRangeEnd	 = spu_cmpgt(g_AsyncRangesDirTo, cEASplat);
			//all bits set for each range starting above the address (address < from), these ranges do not contain it
			const vec_uint4 cBelowRangeStart = spu_cmpgt(g_AsyncRangesDirFrom, cEASplat);
			//range contains the address: address < to and not address < from
			const vec_uint4 cInsideRange		 = spu_andc(cBelowRangeEnd, cBelowRangeStart);
			//fold all elements, any set bit means at least one range matched
			const uint32 cAnyRangeHit				 = spu_extract(spu_orx(cInsideRange), 0);
			return (cAnyRangeHit != 0);
		}

		//returns true if a cache line is not dirty and does not need to be merged with main memory atomically:
		//	either no bit of the line differs from its shadow copy or cEA4 is 0
		//as a side effect the bitwise difference between line and shadow copy (1 for each altered bit) is stored
		//	into g_scWriteBackMask, it is the merge mask used by the atomic write back later on
		//	cCacheEntry: index of the first vector (16 byte unit) of the line within g_pSPUCache / g_pSPUShadowCache
		//	cEA4:				 main memory address of the line replicated into all elements
		//with NO_WRITE_BACK defined true is returned right away, the mask is not touched then
		//forced because used only once
		//keep in sync with FlushCacheRange - impl.
		SPU_DRIVER_INLINE
		const bool IsCacheLineUnchanged(const uint32 cCacheEntry, const vec_uint4 cEA4)
		{
#if defined(NO_WRITE_BACK)
			return true;
#endif
			//generate write back mask and check if contents has really been altered (compare with shadowed area)
			//g_scWriteBackMask contains for each bit 1 if altered, 0 otherwise
			const vec_uint4* const __restrict cpCurLine		 = &g_pSPUCache[cCacheEntry];
			const vec_uint4* const __restrict cpShadowLine = &g_pSPUShadowCache[cCacheEntry];
			vec_uint4* const __restrict pWriteBackMask		 = g_scWriteBackMask;
			//since the backend currently does not reorder loads/stores, we need to explicitly store everything into a register first
			const vec_uint4 cCurLine0 = cpCurLine[0];
			const vec_uint4 cCurLine1 = cpCurLine[1];
			const vec_uint4 cCurLine2 = cpCurLine[2];
			const vec_uint4 cCurLine3 = cpCurLine[3];
			const vec_uint4 cCurLine4 = cpCurLine[4];
			const vec_uint4 cCurLine5 = cpCurLine[5];
			const vec_uint4 cCurLine6 = cpCurLine[6];
			const vec_uint4 cCurLine7 = cpCurLine[7];

			const vec_uint4 cShadowLine0 = cpShadowLine[0];
			const vec_uint4 cShadowLine1 = cpShadowLine[1];
			const vec_uint4 cShadowLine2 = cpShadowLine[2];
			const vec_uint4 cShadowLine3 = cpShadowLine[3];
			const vec_uint4 cShadowLine4 = cpShadowLine[4];
			const vec_uint4 cShadowLine5 = cpShadowLine[5];
			const vec_uint4 cShadowLine6 = cpShadowLine[6];
			const vec_uint4 cShadowLine7 = cpShadowLine[7];

			//mask generation is interleaved with the accumulation of all differences into diffVec
			//each of the 8 mask vectors is accumulated exactly once
			pWriteBackMask[0] = spu_xor(cCurLine0, cShadowLine0);
			pWriteBackMask[1] = spu_xor(cCurLine1, cShadowLine1);
			pWriteBackMask[2] = spu_xor(cCurLine2, cShadowLine2);
			pWriteBackMask[3] = spu_xor(cCurLine3, cShadowLine3);
			vec_uint4 diffVec = spu_or(pWriteBackMask[0], pWriteBackMask[1]);
			pWriteBackMask[4] = spu_xor(cCurLine4, cShadowLine4);
			diffVec = spu_or(diffVec, pWriteBackMask[2]);
			pWriteBackMask[5] = spu_xor(cCurLine5, cShadowLine5);
			diffVec = spu_or(diffVec, pWriteBackMask[3]);
			pWriteBackMask[6] = spu_xor(cCurLine6, cShadowLine6);
			diffVec = spu_or(diffVec, pWriteBackMask[4]);
			pWriteBackMask[7] = spu_xor(cCurLine7, cShadowLine7);
			diffVec = spu_or(diffVec, pWriteBackMask[5]);
			diffVec = spu_or(diffVec, pWriteBackMask[6]);
			diffVec = spu_or(diffVec, pWriteBackMask[7]);
			//a write back is only required if cEA != 0 and the contents has been changed
			//	the folded difference is cleared if cEA4 equals 0, a remaining 0 means: do not write back
			return (spu_extract(spu_andc(spu_orx(diffVec), spu_cmpeq(cEA4, (vec_uint4){0})), 0) == 0);
		}

		//2nd stage of the atomic write back: merges the saved cache line into the main memory contents and writes it back
		//	g_scWriteBackArea:				current main memory contents of the line (requested before, waited for in here)
		//	g_scWriteBackSavedArea:		contents of the SPU cache line saved when the write back was set up
		//	g_scWriteBackMaskAtomic:	1 for each bit altered by the SPU
		//the merged line (saved contents where the mask is set, main memory contents elsewhere) is stored into
		//	g_scWriteBackArea and a conditional atomic put (mfc_putllc) to g_CurAtomicEA is issued
		//the status of this put is not read here, callers check it via mfc_read_atomic_status (see SyncAtomicDCache)
		void DoStartAtomicWriteBack()
		{
			vec_uint4* const __restrict pWriteBackArea						 = g_scWriteBackArea;
			const vec_uint4* const __restrict cpWriteBackSavedArea = g_scWriteBackSavedArea;
			const vec_uint4* const __restrict cpWriteBackMask			 = g_scWriteBackMaskAtomic;

			//interleave main memory and SPU write back contents
			//remember: for each bit = 1 in g_scWriteBackMask means content change -> use g_scWriteBackSavedArea
			//mask and saved contents do not depend on the pending transfer, so they are loaded before syncing
			const vec_uint4 cWriteBackMasks0 = cpWriteBackMask[0];
			const vec_uint4 cWriteBackMasks1 = cpWriteBackMask[1];
			const vec_uint4 cWriteBackMasks2 = cpWriteBackMask[2];
			const vec_uint4 cWriteBackMasks3 = cpWriteBackMask[3];
			const vec_uint4 cWriteBackMasks4 = cpWriteBackMask[4];
			const vec_uint4 cWriteBackMasks5 = cpWriteBackMask[5];
			const vec_uint4 cWriteBackMasks6 = cpWriteBackMask[6];
			const vec_uint4 cWriteBackMasks7 = cpWriteBackMask[7];

			const vec_uint4 cWriteBackSavedAreas0 = cpWriteBackSavedArea[0];
			const vec_uint4 cWriteBackSavedAreas1 = cpWriteBackSavedArea[1];
			const vec_uint4 cWriteBackSavedAreas2 = cpWriteBackSavedArea[2];
			const vec_uint4 cWriteBackSavedAreas3 = cpWriteBackSavedArea[3];
			const vec_uint4 cWriteBackSavedAreas4 = cpWriteBackSavedArea[4];
			const vec_uint4 cWriteBackSavedAreas5 = cpWriteBackSavedArea[5];
			const vec_uint4 cWriteBackSavedAreas6 = cpWriteBackSavedArea[6];
			const vec_uint4 cWriteBackSavedAreas7 = cpWriteBackSavedArea[7];
#if defined(ENABLE_HAZARD_MODE)
			g_AtomicEAToStart	= 0;//reset
#endif
			//main memory contents must not be read before this point
			mfc_read_atomic_status();//sync for pWriteBackArea

			const vec_uint4 cWriteBackAreas0 = pWriteBackArea[0];
			const vec_uint4 cWriteBackAreas1 = pWriteBackArea[1];
			const vec_uint4 cWriteBackAreas2 = pWriteBackArea[2];
			const vec_uint4 cWriteBackAreas3 = pWriteBackArea[3];
			const vec_uint4 cWriteBackAreas4 = pWriteBackArea[4];
			const vec_uint4 cWriteBackAreas5 = pWriteBackArea[5];
			const vec_uint4 cWriteBackAreas6 = pWriteBackArea[6];
			const vec_uint4 cWriteBackAreas7 = pWriteBackArea[7];

			//take the bits of the saved SPU contents where the mask is set, main memory bits otherwise
			const vec_uint4 cSelResults0 = spu_sel(cWriteBackAreas0, cWriteBackSavedAreas0, cWriteBackMasks0);
			const vec_uint4 cSelResults1 = spu_sel(cWriteBackAreas1, cWriteBackSavedAreas1, cWriteBackMasks1);
			const vec_uint4 cSelResults2 = spu_sel(cWriteBackAreas2, cWriteBackSavedAreas2, cWriteBackMasks2);
			const vec_uint4 cSelResults3 = spu_sel(cWriteBackAreas3, cWriteBackSavedAreas3, cWriteBackMasks3);
			const vec_uint4 cSelResults4 = spu_sel(cWriteBackAreas4, cWriteBackSavedAreas4, cWriteBackMasks4);
			const vec_uint4 cSelResults5 = spu_sel(cWriteBackAreas5, cWriteBackSavedAreas5, cWriteBackMasks5);
			const vec_uint4 cSelResults6 = spu_sel(cWriteBackAreas6, cWriteBackSavedAreas6, cWriteBackMasks6);
			const vec_uint4 cSelResults7 = spu_sel(cWriteBackAreas7, cWriteBackSavedAreas7, cWriteBackMasks7);

			pWriteBackArea[0] = cSelResults0;
			pWriteBackArea[1] = cSelResults1;
			pWriteBackArea[2] = cSelResults2;
			pWriteBackArea[3] = cSelResults3;
			pWriteBackArea[4] = cSelResults4;
			pWriteBackArea[5] = cSelResults5;
			pWriteBackArea[6] = cSelResults6;
			pWriteBackArea[7] = cSelResults7;
#if defined(DO_SPU_PROFILING)
			NSPU::NDriver::g_PerfStats.memTransFromLS += 128;
#endif
			mfc_putllc(pWriteBackArea, g_CurAtomicEA, 0, 0);//copy back interleaved cache contents
		}

		//finishes the atomic write back in progress (g_CurAtomicEA != 0), does nothing if there is none
		//	if the status of the pending atomic command reports a failure, the sequence
		//	"get current main memory contents, merge, conditional put" is repeated until the put succeeds
		//	g_CurAtomicEA is reset to 0 afterwards
		//NOTE(review): mfc_prep is issued once before the loop here, while the expansion of this function in
		//	FlushSingleCacheLine issues it inside of the loop - confirm that both forms are intended
		void SyncAtomicDCache()
		{
			if(g_CurAtomicEA != 0)
			{
				//first try to finish normally
				int status = mfc_read_atomic_status();
				IF(status != 0, false)
				{
					//we lost the cache line, perform in loop atomic synchronization
#if defined(DO_SPU_PROFILING)
					++NSPU::NDriver::g_PerfStats.lostLineEvents;
#endif
					//now write back in the usual loop
					mfc_prep(g_scWriteBackArea, g_CurAtomicEA);//copy current main memory contents here again
					do 
					{
#if defined(DO_SPU_PROFILING)
						NSPU::NDriver::g_PerfStats.memTransToLS += 128;
#endif
						mfc_getllar_again();
						//waits for the get issued above, merges and issues the conditional put
						DoStartAtomicWriteBack();
						//status of the conditional put, != 0 means the put has to be repeated
						status = mfc_read_atomic_status();
#if defined(DO_SPU_PROFILING)
						IF(status != 0, false)
							++NSPU::NDriver::g_PerfStats.lostLineEvents;
#endif
					}
					WHILE(status != 0, false);//update needs to be atomic
				}
				g_CurAtomicEA = 0;
			}
		}

#if !defined(SPU_CACHE_MISS_USE_ASM)
		//looks up the missed cache line in the prefetch buffer and registers the next line to prefetch in the prefetch dir
		//	prefetch hit:  the address of the line within g_pPrefetchBuffer is returned
		//	prefetch miss: the line is made available in g_scPreWriteArea, either by starting a DMA get with tag g_scPrefMissTag
		//								 or, if it is the line currently written back atomically, by syncing the write back and copying
		//								 the merged contents
		//	in both cases the caller has to sync on the tag mask returned in pCacheSetmask before using the data
		//the prefetch transfer for the next line itself is not issued here, only the directories are updated
		//	cEAAligned:					 cache line aligned main memory address of the missed line
		//	pCacheSetmask:			 out: DMA tag mask to sync on
		//	pReplIndex:					 out: prefetch slot, on a hit the slot found, on a miss the slot chosen by GetReplIndex
		//	cPrefetchCntlzRes:	 element 0 is 32 if the line is not in the prefetch buffer, otherwise slot index + 28
		//	rNextEA4:						 in/out: cPrefOff gets added and the result is aligned to the cache line size, 
		//											 it is the address entered into the prefetch dir
		//	cCurLRUVal:					 LRU value entered into the prefetch LRU dir for the replaced slot
		//	rReplMask:					 out: all bits set in the element of the replaced slot, all 0 if cPrefOff is 0
		//	cPrefAsyncGatherRes: gathered compare result of the address against g_SPUAsyncDir, != 0 if an async write back
		//											 of the same line might be in flight
		//	cPrefOff:						 offset to the next address to prefetch, 0 leaves the prefetch directories unchanged
		SPU_DRIVER_INLINE
		vec_uint4* DoPrefetchLookup
		(
			const uint32 cEAAligned,
			uint32* pCacheSetmask, 
			uint32* pReplIndex, 
			const vec_uint4 cPrefetchCntlzRes,
			vec_uint4& rNextEA4,
			const vec_uint4 cCurLRUVal,
			vec_uint4& rReplMask,
			const vec_uint4 cPrefAsyncGatherRes,
			const int32 cPrefOff
		)
		{
			//we can have 4 prefetches in flight at a time
			//check if the address is in the prefetch buffer, if so, return the prefetch cache address and 
			//	set the corresponding cache set mask to sync on
			//if this has 32 leading zeros, we have a prefetch buffer miss, start transfer in this case
			//	and replace slot with the lowest LRU value
			//check if the new address matches any currently outgoing transfers, sync in this case
			vec_uint4 *pRetValue;

			rNextEA4 = spu_add(rNextEA4, cPrefOff);

			if(spu_extract(cPrefetchCntlzRes, 0) == 32)//prefer case where it is not present to initiate it faster if not
			{
#if defined(DO_SPU_PROFILING)
				//increment prefetch miss count for current profiling id
				*((uint32*)((uint8*)g_SPUCacheProfIDCounter + (spu_extract(g_SPUCacheCurProfID, 0) << 2) + (MAX_PROF_ID << 3))) += 1;
#endif
				//copy only if the new line is not the one currently atomically written back
				if(cEAAligned != g_CurAtomicEA)//if true, it is transferred into g_scPreWriteArea later
				{
					//compare if any of the current asynchronous transfers match the new cache line, sync in this case
					//cPrefAsyncGatherRes = spu_gather(spu_cmpeq(cEASplat4, g_SPUAsyncDir))
					IF(spu_extract(cPrefAsyncGatherRes, 0) != 0, false)
					{
						//the gathered bits occupy the 4 lowest bits, so the leading zero count minus 28 is the element index,
						//	which equals the tag the async transfer has been issued with
						const uint32 cIndex		= spu_extract(spu_cntlz(cPrefAsyncGatherRes), 0);
						//sync against the corresponding DMA tag
						const uint32 cCurTagMask = (1<<(cIndex - 28));//async transfer tags
						spu_writech(MFC_WrTagMask, cCurTagMask);
						MFC_SYNC(cCurTagMask, 0xFFFFFFFF);
					}
					//this is the worst case scenario: the transfer of the memory missed was not initiated before
					//inlined:	mfc_get(g_scPreWriteArea, cEAAligned, scSPUCacheLineSize, g_scPrefMissTag, 0, 0);
					si_wrch(MFC_LSA,si_from_ptr(g_scPreWriteArea));
					//si_wrch(MFC_EAH,si_from_uint((uint64)(cEAAligned)>>32));
					si_wrch(MFC_EAL,si_from_uint(cEAAligned));
					si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));
					si_wrch(MFC_TagID,si_from_uint(g_scPrefMissTag));
					si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));//initiate transfer
#if defined(DO_SPU_PROFILING)
					NSPU::NDriver::g_PerfStats.memTransToLS += scSPUCacheLineSize;
#endif
					//use time to sync memory transfer of pages if:
					//	job is not in single page mode
					//	any page is in state PAGE_STATE_STREAMING
					//	remaining transfer time is < 8*40 clock cycles (8 decrementer steps)
					//each page slot uses its own DMA tag, starting at g_scDMAPageTag0 for slot 0
					const uint32 cDecrValTransCmp = spu_readch(SPU_RdDec);
					int i=0;
					int curTagMask = (1<<g_scDMAPageTag0);
					do
					{
						const bool cPageCheck = 
							(g_SPUPageStates[i].curState == PAGE_STATE_STREAMING) && 
							(cDecrValTransCmp <= g_SPUPageStates[i].transDecrEnd);
						if(cPageCheck)
						{
							//wait for the transfer of this page and mark the page as usable
							g_SPUPageStates[i].curState = PAGE_STATE_READY;
							spu_writech(MFC_WrTagMask, curTagMask);
							spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
							spu_readch(MFC_RdTagStat);
						}
						curTagMask <<= 1;
					}
					while(++i < scMaxSPUPageCount);
				}
				else
				{
					//better sync here to avoid lots of code problems, case should very rarely happen
					SyncAtomicDCache();
					//copy merged data into main area, no sync is performed
					CopyCacheLine(g_scPreWriteArea, g_scWriteBackArea);
				}
				//the tag of the miss transfer is returned even if no transfer has been started (atomic case above)
				*pCacheSetmask  = (1 << g_scPrefMissTag);
				//get LRU value
				*pReplIndex = GetReplIndex(g_PrefetchLRUDir);

#if defined(DO_SPU_PROFILING)
				++NSPU::NDriver::g_PerfStats.prefetchMisses;
#endif
				rReplMask = spu_insert(0xFFFFFFFF, (const vec_uint4){0}, *pReplIndex);
				pRetValue = (vec_uint4*)g_scPreWriteArea;
			}
			else
			{
				//we have a prefetch hit
				const uint32 cIndex = spu_extract(spu_sub(cPrefetchCntlzRes, (vec_uint4){28}), 0);
	#if defined(DO_SPU_PROFILING)
				++NSPU::NDriver::g_PerfStats.prefetchHits;
	#endif
				rReplMask = spu_insert(0xFFFFFFFF, (const vec_uint4){0}, cIndex);//wraps around
				*pCacheSetmask = (1 << GetPrefetchTagID(cIndex));
				*pReplIndex		 = cIndex;
				pRetValue			 = (vec_uint4*)&g_pPrefetchBuffer[cIndex << scSPUCacheLineSizeShift];
			}

			rNextEA4 = spu_and(rNextEA4, ~scSPUCacheLineSizeMask);
			
			//without a prefetch offset no slot gets replaced, an empty mask keeps both directories as they are
			rReplMask						= cPrefOff?rReplMask:(vec_uint4){0};
			//the register copies are written first and then duplicated into memory
			g_PrefetchLRUDirReg = spu_sel(g_PrefetchLRUDir, cCurLRUVal, rReplMask);//update LRU entry
			g_PrefetchDirReg	  = spu_sel(g_PrefetchDir, rNextEA4, rReplMask);//update dir entry
			g_PrefetchLRUDir		= g_PrefetchLRUDirReg;
			g_PrefetchDir				= g_PrefetchDirReg;
			return pRetValue;
		}

		//copies one cache line (8 vectors) from cpSrc to both pDest and pDest1 and starts the prefetch of the next line
		//	the source is loaded into registers completely before the prefetch is issued, since cpSrc might be the very
		//	prefetch slot which gets overwritten by the new transfer
		//	pDest, pDest1: destinations of the copy
		//	cpSrc:				 source cache line
		//	cNextEA4:			 element 0 holds the main memory address of the line to prefetch
		//	cReplIndex:		 prefetch slot to transfer into, it selects the buffer line and the DMA tag (GetPrefetchTagID)
		//the MFC channel writes are interleaved with the stores of the copy, no sync is performed
		SPU_DRIVER_INLINE
		void CopyCacheLineAndPrepareNextPrefetch
		(
			vec_uint4* const __restrict pDest,
			vec_uint4* const __restrict pDest1,
			const vec_uint4* const __restrict cpSrc,
			const vec_uint4 cNextEA4, 
			const uint32 cReplIndex
		)
		{
			const vec_uint4 cSrc0 = cpSrc[0];
			const vec_uint4 cSrc1 = cpSrc[1];
			const vec_uint4 cSrc2 = cpSrc[2];
			const vec_uint4 cSrc3 = cpSrc[3];
			const vec_uint4 cSrc4 = cpSrc[4];
			const vec_uint4 cSrc5 = cpSrc[5];
			const vec_uint4 cSrc6 = cpSrc[6];
			const vec_uint4 cSrc7 = cpSrc[7];
			//fenced transfer because we might overwrite an existing transfer still in progress
			si_wrch(MFC_LSA,si_from_ptr(&g_pPrefetchBuffer[cReplIndex << scSPUCacheLineSizeShift]));
			pDest[0]  = cSrc0;
//			si_wrch(MFC_EAH,si_from_uint((uint64)(spu_extract(cNextEA4, 0))>>32));
			pDest[1] = cSrc1;
			si_wrch(MFC_EAL,si_from_uint(spu_extract(cNextEA4, 0)));
			pDest[2] = cSrc2;
			si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));
			pDest[3] = cSrc3;
			si_wrch(MFC_TagID,si_from_uint(GetPrefetchTagID(cReplIndex)));
			pDest[4] = cSrc4;
			si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD));//toggle prefetch
			pDest[5] = cSrc5;
			pDest[6] = cSrc6;
			pDest[7] = cSrc7;
			//2nd copy of the same line
			pDest1[0] = cSrc0;
			pDest1[1] = cSrc1;
			pDest1[2] = cSrc2;
			pDest1[3] = cSrc3;
			pDest1[4] = cSrc4;
			pDest1[5] = cSrc5;
			pDest1[6] = cSrc6;
			pDest1[7] = cSrc7;
		}
#endif //SPU_CACHE_MISS_USE_ASM

		//advances the global LRU counter g_LRUCounter (defined elsewhere) by 1
		//the increment is applied to all elements of the vector
		SPU_DRIVER_INLINE
		void IncrementLRUCounter()
		{
			g_LRUCounter = spu_add(g_LRUCounter, 1);//only first component matters, but for performance do it this way
		}

#if !defined(SPU_CACHE_MISS_USE_ASM)
		//flushes one cache line using the write mask and MFC atomics
		//issue next transfer if there is one required (issued whilst input transfer is going on)
		//before issuing output transfer, it is checked if the contents has truly been changed
		//does not take care about dependencies
		//	cSet:				 cache set of the line, must be less than g_SPUNumSets
		//	cIndexInSet: way of the line within the set
		//3 cases are handled:
		//	line unchanged (or not in use, EA is 0): nothing is transferred, g_CurWrittenEA is left 0
		//	line inside an async write back range:	 the line is copied into one of the 4 async buffers (round robin) and a
		//		plain put is started with the buffer index as tag, the EA is recorded in g_SPUAsyncDir, no sync is performed
		//	otherwise (atomic write back): a pending atomic write back is finished first, then the 1st stage for this line
		//		is set up: the main memory contents is requested, the write back mask and the line contents are saved and
		//		the start flag for the 2nd stage (DoStartAtomicWriteBack) is set
		//in both write back cases g_CurWrittenEA holds the EA of the line afterwards
		SPU_DRIVER_INLINE
		void FlushSingleCacheLine(const uint32 cSet, const uint32 cIndexInSet)
		{
			//NOTE: g_AtomicEAToStartReg set to 0 in asm
			assert(cSet < g_SPUNumSets);
			const uint32 cEA					= GetCacheLineEA(g_pSPUCacheDir, cSet, cIndexInSet);
			//index of the first vector (16 byte unit) of the line within the cache
			const uint32 cCacheEntry	= ((cSet << scSPUCacheSetNumWaysShift) + cIndexInSet) << (scSPUCacheLineSizeShift-4);
			const vec_uint4 cEASplat4 = spu_splats(cEA);
			g_CurWrittenEA = (vec_uint4){0};
			//test for cEA != 0 is moved together with IsCacheLineUnchanged
			//determine if cache line is dirty (meaning that at least one bit has changed in the cache line)
			//this call generates g_scWriteBackMask as well, which is needed by the atomic write back below
			const bool cIsCacheLineUnChanged = IsCacheLineUnchanged(cCacheEntry, cEASplat4);
			const bool cTransferAsync = CheckWriteBackAsync(cEASplat4);
			//SyncAtomicDCache();//2nd step of atomic write back
	#if defined(PREF_NO_WRITE_BACK)
			if(cIsCacheLineUnChanged)
	#else
			IF(cIsCacheLineUnChanged, false)
	#endif
			{
	#if defined(DO_SPU_PROFILING)
				IF(cEA != 0, true)
					++NSPU::NDriver::g_PerfStats.cacheFlushsNoWrite;
	#endif
				return;
			}

	#if defined(SUPP_DABR)
			//NOTE(review): g_sDABR is declared in NSPU::NDriver but referenced unqualified here - confirm it resolves with SUPP_DABR
			if(g_sDABR.ppuEA == cEA)
				snPause();
	#endif
			g_CurWrittenEA = cEASplat4;
			IF(cTransferAsync, false)
			{
				//the current async tag selects the buffer, the directory element and the DMA tag
				vec_uint4* const __restrict pCurAsyncCacheLine = &g_SPUAsyncCacheLine[g_CurSPUAsyncTag << (scSPUCacheLineSizeShift-4)];
				const uint32 cCurTag = g_CurSPUAsyncTag;
				g_CurSPUAsyncTag = (g_CurSPUAsyncTag + 1) & 3;//0..3
				//disable syncing for now since for each back transfer we need to transfer one cacheline here
				//syncing is only performed if the new address is found in g_SPUAsyncDir
				CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry]);
		#if defined(DO_SPU_PROFILING)
				NSPU::NDriver::g_PerfStats.memTransFromLS += scSPUCacheLineSize;
		#endif
				//inlined:	mfc_put(f)(pCurAsyncCacheLine, cEA, scSPUCacheLineSize, cCurTag, 0, 0);//start async write back
				si_wrch(MFC_LSA,si_from_ptr(pCurAsyncCacheLine));
				//si_wrch(MFC_EAH,si_from_uint((uint64)(cEA)>>32));
				si_wrch(MFC_EAL,si_from_uint(cEA));
				si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));
				si_wrch(MFC_TagID,si_from_uint(cCurTag));
				si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//start asynchronous transfer back

				g_SPUAsyncDir			= spu_insert(cEA, g_SPUAsyncDir, cCurTag);//update dir

		#if defined(DO_SPU_PROFILING)
				++NSPU::NDriver::g_PerfStats.cacheWritesBackASync;
		#endif
				return;
			}
				//expansion of SyncAtomicDCache()
				//differences to SyncAtomicDCache: mfc_prep is issued inside of the loop and g_CurAtomicEA is not reset,
				//	since it gets overwritten with the new EA below
			IF(g_CurAtomicEA != 0, true)
			{
				//first try to finish normally
				int status = mfc_read_atomic_status();
				IF(status != 0, false)
				{
					//we lost the cache line, perform in loop atomic synchronization
	#if defined(DO_SPU_PROFILING)
					++NSPU::NDriver::g_PerfStats.lostLineEvents;
	#endif
					//now write back in the usual loop
					do 
					{
	#if defined(DO_SPU_PROFILING)
						NSPU::NDriver::g_PerfStats.memTransToLS += 128;
	#endif
						mfc_prep(g_scWriteBackArea, g_CurAtomicEA);//copy current main memory contents here again
						mfc_getllar_again();//copy current main memory contents here again
						DoStartAtomicWriteBack();
						status = mfc_read_atomic_status();
	#if defined(DO_SPU_PROFILING)
						IF(status != 0, false)
							++NSPU::NDriver::g_PerfStats.lostLineEvents;
	#endif
					}
					WHILE(status != 0, false);//update needs to be atomic
				}
			}
	#if defined(DO_SPU_PROFILING)
			NSPU::NDriver::g_PerfStats.memTransToLS += 128;
	#endif
			//1st stage of the atomic write back for the new line: request the current main memory contents
			//	the request is not waited for here, DoStartAtomicWriteBack syncs on it
			mfc_prep(g_scWriteBackArea, cEA);//copy current main memory contents here again
			mfc_getllar_again();
			//g_scWriteBackMask gets regenerated by the next flush, so a private copy is kept for the 2nd stage
			CopyCacheLine((vec_uint4*)g_scWriteBackMaskAtomic, (vec_uint4*)g_scWriteBackMask);//copy write back mask
			g_CurAtomicEA	= cEA;
	#if defined(ENABLE_HAZARD_MODE) 
			g_AtomicEAToStart = 1;//flag active 
	#else
			g_AtomicEAToStartReg = (vec_uint4){1};//asm checks the value
	#endif
			//save current contents
			CopyCacheLine(g_scWriteBackSavedArea, &g_pSPUCache[cCacheEntry]);
	#if defined(DO_SPU_PROFILING)
			++NSPU::NDriver::g_PerfStats.cacheWritesBackSync;
	#endif
		}

		//cache miss handler: loads the cache line containing cEA into way cReplIndex of set cSet
		//	(the line previously held there is handed to FlushSingleCacheLine first) and returns where it got placed
		//do not change Order of Parameters to have asm-impl. of DoCacheLookup_spu in sync
		//cSize must stay as 2nd parameter
		//	NOTE(review): the 2nd parameter is named cSet in this signature, "cSize" looks stale - confirm against asm
		//parameters:
		//	cEA: requested main memory address, addresses below 256 KB are rejected
		//	cSet: cache set the address maps to (asserted to be < g_SPUNumSets)
		//	cPrefetchCntlzRes: precomputed by the caller (DoCheckedLookupCache passes the cntlz of the gathered compare
		//		mask of the aligned EA against g_PrefetchDir), is purely there for performance reason
		//		(can be precomputed in asm version of DoCacheLookup_spu)
		//	cReplIndex: way within cSet which receives the new cache line
		//	cPrefAsyncGatherRes: precomputed by the caller (DoCheckedLookupCache passes the gathered compare mask of the
		//		aligned EA against g_SPUAsyncDir)
		//	cPrefOff: forwarded to DoPrefetchLookup (presumably the prefetch offset - confirm),
		//		0 selects the plain copy path which does not prepare a next prefetch
		//returns byte offset (relative to g_pSPUCache) of the cache line in which it got placed,
		//	for cEA < 256 KB nothing is loaded and cEA itself is returned
		const int SPUCacheMissHandler
		(
			const uint32 cEA, 
			const uint32 cSet,
			const vec_uint4 cPrefetchCntlzRes,
			const uint32 cReplIndex,
			const vec_uint4 cPrefAsyncGatherRes,
			const int32 cPrefOff 
		)
		{
			//addresses below 256 KB are not handled by the cache: bail out and hand the address back unchanged
			IF(cEA < 256*1024, false)
			{
#if !defined(ENABLE_HAZARD_MODE)
				IncrementLRUCounter();
#endif
				assert((unsigned int)(cEA)>256*1024);//assert to know when it happens (performance and code size penalty)
				return cEA;
			}
#if !defined(ENABLE_HAZARD_MODE) 
			//reset the flag register, the write back code sets it to 1 once an atomic write back has been prepared
			g_AtomicEAToStartReg = (vec_uint4){0};
#endif

#if !defined(_NO_SPU_ASSERT)
			//stack must not come closer than STACK_WARNING_VAL to the top of the program
			assert(GetStackAddress() > NSPU::g_sProgramTopLS + STACK_WARNING_VAL);
#endif

#if defined(DO_SPU_PROFILING)
			//increment miss count for current profiling id
			//	(uint32 counter located MAX_PROF_ID entries behind the start of g_SPUCacheProfIDCounter)
			*((uint32*)((uint8*)g_SPUCacheProfIDCounter + (spu_extract(g_SPUCacheCurProfID, 0) << 2) + (MAX_PROF_ID << 2))) += 1;
#endif

#if defined(DO_SPU_PROFILING)
			//decrementer value at entry, used to accumulate the time spent in the miss handler
			const uint32 cMissHandlerStartTime = spu_readch(SPU_RdDec);
#endif

			const uint32 cEAAligned = cEA & ~scSPUCacheLineSizeMask;//cache line address

			//single cache line case
			//cReplIndex tells the index where to place new Cache Line within cSet
			//immediately initiate transfer to make full transfer time dependent on that
			//tagMask, replIndex and replMask are handed to DoPrefetchLookup uninitialized and are used as its results below
			//	NOTE(review): DoPrefetchLookup is defined elsewhere, presumably it also alters nextEA4 - confirm
			vec_uint4 nextEA4 = spu_splats(cEA);
			uint32 tagMask, replIndex;
			vec_uint4 replMask;
			vec_uint4* const __restrict cSPUAddr = DoPrefetchLookup
				(
					cEAAligned, 
					&tagMask, 
					&replIndex, 
					cPrefetchCntlzRes, 
					nextEA4, 
					g_LRUCounter, 
					replMask, 
					cPrefAsyncGatherRes,
					cPrefOff
				);

			const vec_uint4 cAsyncDirSaved = g_SPUAsyncDir;//might be altered in FlushSingleCacheLine

			FlushSingleCacheLine(cSet, cReplIndex);//writes back if dirty

			//compare if some async output equals the prefetch
			const vec_uint4 cAsyncCmpRes = spu_cmpeq(nextEA4, cAsyncDirSaved);

			//set data for actual dir entry
			assert(cSet < g_SPUNumSets);
			SetCacheLineEA(g_pSPUCacheDir, cSet, cReplIndex, cEAAligned);
			//or across all 4 compare results: non zero in element 0 if any async dir slot matched
			const vec_uint4 cAnyHit	= spu_orx(cAsyncCmpRes);
			//update lru for the actual line in question
			UpdateLRU(g_pSPUCacheLRUCtrl[cSet], cReplIndex, g_LRUCounterIncr);//set current LRU counter value into index slot

			//sync with mfc_get cmd
			spu_writech(MFC_WrTagMask, tagMask);

			//get cache line number
			//	expressed in 16 byte units (shift by scSPUCacheLineSizeShift-4) to index g_pSPUCache / g_pSPUShadowCache
			const uint32 cLine4 = (((cSet << scSPUCacheSetNumWaysShift) + cReplIndex) << (scSPUCacheLineSizeShift-4));

#if defined(DO_SPU_PROFILING)
			++NSPU::NDriver::g_PerfStats.cacheMisses;
#endif

			//if we have a match, nullify latest slot (could be done branch free -> turned out to be more expensive)
			IF(spu_extract(cAnyHit, 0) != 0, false)
			{
				//insert a null into replaced slot
				//	replMask selects the slot, both the register copies and the memory copies of the prefetch dirs are updated
				g_PrefetchLRUDirReg		= spu_sel(g_PrefetchLRUDir, (vec_uint4){0}, replMask);
				g_PrefetchDirReg			= spu_sel(g_PrefetchDir, (vec_uint4){0}, replMask);
				g_PrefetchLRUDir			= g_PrefetchLRUDirReg;
				g_PrefetchDir					= g_PrefetchDirReg;
			}

			//sync input transfer (polling or reading from a blocking channel)
			//in case we just hit the atomic Line previously written back, the sync is a waste but this case should 
			//	extremely rare happen
			MFC_SYNC(tagMask, cEAAligned);
			vec_uint4* const __restrict pCacheLine = (vec_uint4*)&g_pSPUCache[cLine4];//prefetch cache line ptr

			//copy contents into main and shadow area (save storage into shadow buffer to do later)
			//	source is the address returned by DoPrefetchLookup, destinations are the cache line and its shadow copy
			IF(cPrefOff == 0, false)
				CopyCacheLine2Dest(pCacheLine, &g_pSPUShadowCache[cLine4], cSPUAddr);
			else
				CopyCacheLineAndPrepareNextPrefetch(pCacheLine, &g_pSPUShadowCache[cLine4], cSPUAddr, nextEA4, replIndex);
	#if defined(DO_SPU_PROFILING)
			NSPU::NDriver::g_PerfStats.memTransToLS += scSPUCacheLineSize;
	#endif

#if defined(DO_SPU_PROFILING)
			//elapsed time is start - end (presumably because the decrementer counts downwards)
			const uint32 cMissHandlerEndTime = spu_readch(SPU_RdDec);
			NSPU::NDriver::g_PerfStats.spuCacheMissTime += cMissHandlerStartTime - cMissHandlerEndTime;
#endif

			//executed in ASM otherwise (later execution possible and compiler can optimize more without IF
#if defined(ENABLE_HAZARD_MODE)
			IF(g_AtomicEAToStart, false)//afford branch miss since if atomic write back is in flight, it is not finished anyway
				DoStartAtomicWriteBack();  
#else
	#if defined(DO_SPU_PROFILING)
			//only the statistics are updated here, the write back itself is toggled in the asm epilogue
			IF(spu_extract(g_AtomicEAToStartReg, 0), false) 
				NSPU::NDriver::g_PerfStats.memTransFromLS += 128;
	#endif
#endif

			//convert the line number from 16 byte units into a byte offset
			return (cLine4 << 4); 
		} 
#endif //SPU_CACHE_MISS_USE_ASM

		//look up and return data from the cache
		//	C version with hazard check, keep in sync with asm version (DoLookupCache)
		//  if the data is not currently in the cache, it is transferred from main memory
		//  this code uses a conditional branch to the cache miss handler in the event that the requested data is not
		//  present in the cache
		//  should purely serve as reference impl.
#if defined(ENABLE_HAZARD_MODE)
		//returns the address inside g_pSPUCache which holds the data of main memory address cEA,
		//	the cache miss handler is invoked first if the cache line is not present in its set
		//parameters:
		//	cEA: main memory address to look up, checked to be above 256 KB
		//	cLRUIncr: added to g_LRUCounter, the sum (g_LRUCounterIncr) is the LRU value stamped onto the accessed line
		//	cPrefOff: forwarded unchanged to SPUCacheMissHandler in the miss case
		void* DoCheckedLookupCache(const uint32 cEA, const uint32 cLRUIncr, const int32 cPrefOff)
		{
#if defined(SUPP_DABR)
			//pause (snPause) if the value at the watched local store address has changed since the last lookup
			const uint32 cCurDABRVal = *((uint32*)(void*)g_sDABR.lsAddr);
			if(g_sDABR.oldVal != cCurDABRVal)
			{
				snPause();
				g_sDABR.oldVal = cCurDABRVal;//update to continue
			}
#endif
			spu_CheckCacheHazard(cEA > (uint32)256 * 1024);
			//4 way check of aligned address -> move ea into all 4 slots
			const vec_uint4 cEAAligned4	= spu_splats(cEA & ~scSPUCacheLineSizeMask);
#if defined(DO_SPU_PROFILING)
			//remember the profiling id for the miss handler and count this lookup for it
			g_SPUCacheCurProfID = g_ProfID;
			((uint32*)g_SPUCacheProfIDCounter)[spu_extract(g_ProfID,0)] += 1;
#endif
			#if !defined(_NO_SPU_ASSERT)
				spu_CheckCacheHazard(GetStackAddress() > NSPU::g_sProgramTopLS + STACK_WARNING_VAL);
			#endif
			const int cSet		= GetCacheSetIndex(cEA);					//get set index
			assert(cSet < g_SPUNumSets);
			//a negative result is treated as a miss below
			int indexInSet		= GetCacheIndexNum(SetCache4WayLookup(cSet, cEAAligned4));//lookup of all 4 entries if there is a hit
			g_LRUCounterIncr  = spu_add(g_LRUCounter, cLRUIncr);//only first component matters, but for performance do it this way
			int lineStartOff;	//byte offset of the cache line relative to g_pSPUCache
			IF(indexInSet < 0, false)
			{
				//miss: select the way to replace from the LRU state of the set
				const uint32 cReplIndex = GetReplIndex(g_pSPUCacheLRUCtrl[cSet]);
				//precalc the same stuff as in the miss handler
				const vec_uint4 cCmpRes	= spu_cmpeq(g_PrefetchDir, cEAAligned4);
				vec_uint4 cntRes				= spu_cntlz(spu_gather(cCmpRes));
				const vec_uint4 cCmp		= spu_cmpeq(cEAAligned4, g_SPUAsyncDir);
				lineStartOff = NSPU::NCache::SPUCacheMissHandler(cEA, cSet, cntRes, cReplIndex, spu_gather(cCmp), cPrefOff);
				//verify prefetch cond so that nothing which is currently transferred back is prefetched
				//	wrong old data would be the result
				const vec_uint4 cPrefCmpRes	= spu_cmpeq(g_CurWrittenEA, g_PrefetchDir);//mask, if equal, reset LRU and dir entry
				g_PrefetchLRUDir						= spu_sel(g_PrefetchLRUDir, (vec_uint4)0, cPrefCmpRes);//reset LRU entry if equal
				g_PrefetchDir								= spu_sel(g_PrefetchDir, (vec_uint4)0, cPrefCmpRes);//reset dir entry if equal
			}
			else
			{
#if defined(DO_SPU_PROFILING)
				++NSPU::NDriver::g_PerfStats.cacheHits;
#endif
				//each set can have scSPUCacheSetNumWays entries, indexInSet tells which way is to be used
				//	the lookup result is mirrored (mask - result) to obtain the way index
				indexInSet		= scSPUCacheSetNumWaysMask - indexInSet;
				lineStartOff	= (((cSet << scSPUCacheSetNumWaysShift) + indexInSet) << scSPUCacheLineSizeShift);
				//update LRU value
				UpdateLRU(g_pSPUCacheLRUCtrl[cSet], indexInSet, g_LRUCounterIncr);
			}
			IncrementLRUCounter();//increment LRU counter
			//start of the cache line plus the offset of cEA within its line
			return (void*)(&((uint8*)g_pSPUCache)[lineStartOff + (cEA & scSPUCacheLineSizeMask)]);
		}
#endif

#if !defined(_NO_SPU_CACHE_ASSERT) 
		//checks if a spu address is truly present in the cache, used for debugging purpose
		const int GetCacheAssertAddr(const uint32 cSPUEA, const uint32 cEA, const char *cpFile, const uint32 cLine)
		{
			IF(cEA <= (uint32)256 * 1024, false)
			{
				printf("Cache assert: addr 0x%08x is invalid PPU addr.(%s:%d)\n", cEA, cpFile, cLine);
				return 0;
			}
			const int cSet = GetCacheSetIndex(cEA);
			const int cIndexInSet = GetCacheIndexNum
				(SetCache4WayLookup(cSet, spu_splats((uint32)(cEA) & ~scSPUCacheLineSizeMask)));
			IF(cIndexInSet  < 0, false)
			{
				printf("Cache assert: addr 0x%08x not in cache(file:%s:%d)\n", cEA, cpFile, cLine);
				return 0;
			}
			const int cLineStartOff	= (((cSet << scSPUCacheSetNumWaysShift) + (scSPUCacheSetNumWaysMask - cIndexInSet)) << scSPUCacheLineSizeShift);
			const uint32 cResultSPUAddr = (uint32)(void*)(&((uint8*)g_pSPUCache)[cLineStartOff + (cEA & scSPUCacheLineSizeMask)]);
			IF(cSPUEA != cResultSPUAddr, false)
			{
				printf("Cache addr.failure, exp: 0x%08x  resulting: 0x%08x, file:%s:%d\n",cSPUEA, cResultSPUAddr, cpFile, cLine);
				return 0;
			}
			return 1;
		}
#endif
	}//NCache
}//NSPU
 
#if !defined(SPU_CACHE_MISS_USE_ASM)
	//flushes the entire cache, do not call it several times
	//every directory entry is reset to 0, each changed cache line is written back to its main memory address,
	//	afterwards the LRU counter and the prefetch directories are reset
	//parameters:
	//	cDoSync: if non zero and at least one async write back has been issued, the output DMA tag is waited for
	//	cTransProfdata: only evaluated if DO_SPU_PROFILING is defined: transfers the profiling data back to PPU
	//		(if requested by the info block) and resets the profiling counters
	void NSPU::CSPUMemMan::FlushCacheComplete(const int cDoSync, const bool cTransProfdata)
	{
	#if defined(DO_SPU_PROFILING)
		if(cTransProfdata && NSPU::NDriver::g_sInfoBlock.TransferProfDataBack())
			SPUProfDataHandler();//transfers the profiling data back to PPU
	#else
		(void)cTransProfdata;
	#endif
		//reset and flush each cache line
		//specialized implementation of FlushSingleCacheLine
		int index = 0;					//way within the current set
		uint32 cacheEntry = 0;	//index of the current line into g_pSPUCache (16 byte units)
		int i = 0;							//linear cache line counter
		int set = 0;						//current set, derived from i
		const int cSPUCacheEntries = (NCache::g_SPUNumSets << scSPUCacheSetNumWaysShift);
		si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));//write Cache Line size only once
		do
		{
			const uint32 cEA = GetCacheLineEA(g_pSPUCacheDir, set, index);
			SetCacheLineEA(g_pSPUCacheDir, set, index, 0);//reset required if it is called within job (not at the end)
			const vec_uint4 cEASplat4 = spu_splats(cEA);
			//test for cEA != 0 is moved together with IsCacheLineUnchanged to give the compiler more room for optimization
			//determine if cache line is dirty (meaning that at least one bit has changed in the cache line)
			const bool cIsCacheLineUnChanged = NCache::IsCacheLineUnchanged(cacheEntry, spu_promote(cEA, 0));
			//true if the line is to be written back by a plain DMA put rather than by the atomic path
			const bool cTransferAsync = NCache::CheckWriteBackAsync(cEASplat4);
			//PREF_NO_WRITE_BACK only changes the branch hint, not the condition
	#if defined(PREF_NO_WRITE_BACK)
			IF(cIsCacheLineUnChanged, true)
	#else
			IF(cIsCacheLineUnChanged, false)
	#endif
			{
				//nothing to write back, only count lines which were actually in use
	#if defined(DO_SPU_PROFILING)
				IF(cEA != 0, true)
					++NSPU::NDriver::g_PerfStats.cacheFlushsNoWrite;
	#endif
			}
			else
			{
	#if defined(SUPP_DABR)
				if(g_sDABR.ppuEA == cEA)
					snPause();
	#endif
				//cache line is altered
				IF(cTransferAsync, false)
				{
	#if defined(DO_SPU_PROFILING)
					NSPU::NDriver::g_PerfStats.memTransFromLS += scSPUCacheLineSize;
	#endif
					//inlined: mfc_put(&g_pSPUCache[cacheEntry], cEA, scSPUCacheLineSize, g_scDMAOutputTag, 0, 0);//start async write back
					//	(MFC_Size has been written once in front of the loop)
					si_wrch(MFC_LSA,si_from_ptr(&g_pSPUCache[cacheEntry]));
					//si_wrch(MFC_EAH,si_from_uint((uint64)(&g_pSPUCache[cacheEntry])>>32));
					si_wrch(MFC_EAL,si_from_uint(cEA));
					si_wrch(MFC_TagID,si_from_uint(g_scDMAOutputTag));
					si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//toggle write back

					//remember that an async transfer is in flight, tested for the final sync below
					NCache::g_CurSPUAsyncTag = 0xFFFFFFFF;
	#if defined(DO_SPU_PROFILING)
					++NSPU::NDriver::g_PerfStats.cacheWritesBackASync;
	#endif
				}
				else
				{
					//finish a previously started atomic write back before the shared write back buffers are reused
					NCache::SyncAtomicDCache();//2nd step of atomic write back
	#if defined(DO_SPU_PROFILING)
					NSPU::NDriver::g_PerfStats.memTransToLS += 128;
	#endif
					//copy current main memory contents here again, necessary Calls for mfc_getllar(g_scWriteBackArea, cEA, 0, 0);
					si_wrch(MFC_LSA,si_from_ptr(g_scWriteBackArea));
					si_wrch(MFC_EAL,si_from_uint(mfc_ea2l(cEA)));
					si_wrch(MFC_TagID,si_from_uint(0));
					mfc_getllar_again();

					NCache::CopyCacheLine((vec_uint4*)g_scWriteBackMaskAtomic, (vec_uint4*)g_scWriteBackMask);//copy write back mask
					NCache::g_CurAtomicEA = cEA;
					//save current contents
					NCache::CopyCacheLine(g_scWriteBackSavedArea, &g_pSPUCache[cacheEntry]);
	#if defined(DO_SPU_PROFILING)
					++NSPU::NDriver::g_PerfStats.cacheWritesBackSync;
	#endif
					//unlike in the miss handler the write back is started right here
					NCache::DoStartAtomicWriteBack();
				}
			}
			//advance to the next cache line: way wraps around within the set, set follows from the line counter
			++i;	
			index = (index + 1)	& scSPUCacheSetNumWaysMask;
			set   = (i >> scSPUCacheSetNumWaysShift);
			cacheEntry += (scSPUCacheLineSize>>4);
		}
		WHILE(i < cSPUCacheEntries, true);

		//NOTE(review): this guard repeats the one enclosing the whole function
	#if !defined(SPU_CACHE_MISS_USE_ASM)
		g_CurWrittenEA		= (vec_uint4){0};
	#endif

		//it is not syncd since we use a barrier transfer in the driver
		//the DMAC can have only 16 outstanding transfers which should suffice 
		//sync last but one transfer, set tag update and poll for completion 
		const bool cSyncTransfer				= cDoSync && (NCache::g_CurSPUAsyncTag == 0xFFFFFFFF);
		IF(cSyncTransfer, false)
		{
			const uint32 cTagMask						= (1<<g_scDMAOutputTag);
			spu_writech(MFC_WrTagMask, cTagMask);//sync all at once
			MFC_SYNC(cTagMask, 0xFFFFFFFF);
		}
	#if defined(DO_SPU_PROFILING)
		//init profiling data
		//	the counter area covers 4*3 bytes per profiling id and is cleared one vector at a time
		if(cTransProfdata)
		{
			for(unsigned int i=0; i<MAX_PROF_ID * (4*3) / sizeof(vec_uint4); ++i)
				NCache::g_SPUCacheProfIDCounter[i] = (vec_uint4){0};
			NCache::g_SPUCacheCurProfID = (vec_uint4){0};
		}
	#endif
		g_LRUCounter			= spu_splats((uint32)0);//reset LRU counter to avoid wrap around
		//reset prefetches
		NCache::g_PrefetchLRUDir	= spu_splats((uint32)0);
		NCache::g_PrefetchDir			= spu_splats((uint32)0);
		NCache::SyncAtomicDCache();//2nd step of atomic write back
	}
#endif//SPU_CACHE_MISS_USE_ASM

//---------------------------------------------printf management------------------------------------------------

//ILog s_Log;

#if defined(SUPP_PRINTF)

#include "vsprintf_spu.cpp"

//workflow is as follows:
//		transfer buffer to dedicated area on PPU
//		sync transfer 
//		toggle interrupt
//		wait for answer from PPU
void SPUPrintfHandler(const char *cpFormat,...)
{
	va_list	argList;
	char buf[SPU_PRINTF_BUF_SIZE] _ALIGN(128);
	va_start(argList, cpFormat);
	vsprintf_spu(buf, cpFormat, argList);
	va_end(argList);
	mfc_put((void*)(uintptr_t)buf, NSPU::NDriver::g_DestPrintfAreaEA, SPU_PRINTF_BUF_SIZE, g_scDMAPPUPrintfTag, 0, 0);
	SyncMemory(g_scDMAPPUPrintfTag);
	spu_writech(SPU_WrOutIntrMbox, (EVENT_PRINTF_PORT_CUSTOM << EVENT_PORT_SHIFT));
	spu_readch(SPU_RdSigNotify1);
}
#endif //SUPP_PRINTF

//---------------------------------------------profiling data management----------------------------------------

#if defined(DO_SPU_PROFILING)

//workflow is as follows:
//		transfer buffer to dedicated area on PPU
//		sync transfer 
//		toggle interrupt
//		wait for answer from PPU
void SPUProfDataHandler()
{
	uint32 transferSizeLeft = MAX_PROF_ID * (4*3);
	uint32 curDest = NSPU::NDriver::g_DestProfAreaEA;
	uint8 *pCurLS = (uint8*)(uintptr_t)NSPU::NCache::g_SPUCacheProfIDCounter;
	do 
	{
		uint32 curTransferSize = (transferSizeLeft>16*1024)?16*1024 : transferSizeLeft;
		mfc_put((void*)pCurLS, curDest, curTransferSize, g_scDMAPPUProfTag, 0, 0);
		curDest += curTransferSize;
		pCurLS	+= curTransferSize;
		transferSizeLeft -= curTransferSize;
	} 
	while(transferSizeLeft > 0);
	SyncMemory(g_scDMAPPUProfTag);
	spu_writech(SPU_WrOutIntrMbox, (uint32)(uint8)NSPU::NDriver::g_sInfoBlock.jobId | (EVENT_PROF_PORT << EVENT_PORT_SHIFT));
	NSPU::NCache::g_SPUCacheCurProfID = (vec_uint4){0};
	spu_readch(SPU_RdSigNotify1);
}

#endif

#endif //__SPU__
#endif //PS3
