#if defined(PS3)

#define STACK_WARNING_VAL 511

#define NOP nop $127
#define STOPD stop 254

//look up and return data from the cache
//  if the data is not currently in cache, then call the cache miss handler:
//		const int SPUCacheMissHandler
//		(
//			const unsigned int cEA, //new address
//			const uint16 cSet,			//set where to replace
//			const vec_uint4 cPrefetchCntlzRes,//spu_cntlz(spu_gather(cCmpRes))
//			const uint32 cReplIndex //replacement index according to GetReplIndex(g_SPUCacheLRUCtrl[cSet])
//			const vec_uint4 cPrefAsyncGatherRes //spu_gather(spu_cmpeq(cEASplat4, g_SPUAsyncDir))
//		)
//
//	cSet and cEA are the only parameters that are really required; the rest are precomputed for performance reasons,
//		because the branch hint (which must be issued at least 8 instructions before the branch) left empty slots
//
//	asm version without hazard check, keep in sync with C version (DoLookupCacheChecked)
//
//	some code optimizations are: cSet is shifted and masked so that it directly results in cSet << 4 (address as offset)
//  cGatherRes .. cSubRes are replaced by masking using the cCmpRes and orx to retrieve the correct g_sSPUCacheMem[cLineOff]
//
//in the post miss handler code, the prefetch directory is tested against the last written back EA (stored in g_CurWrittenEA)
//
//to force the compiler to insert the branch hint for the lookup function, the call is actually:
//
//		DoLookupCache((ea), (ea) & ~scSPUCacheLineSizeMask, (ea)>>5(3) & ((num_sets -1)<<4), cPrefOff, LRU increment, 66051)
//								   $3              $4                         $5														$6           $7         $8
//	this way, we just force to evaluate the first few instructions into parameters
//	branch latency also ensures stallfree processing in terms of first asm statements and parameters

//behind each instruction: pipeline (0/1) / latency(cycles)

	.file	"DoCacheLookup_spu.S"
	
//-----------------------------------------DoCacheMiss---------------------------------------------------------
//DoCacheMiss: cache lookup entry point whose branch hint is set up for the miss case
//	runs the same directory compare / LRU update sequence as DoLookupCache, but the hbra below hints the
//		branch at .LookupMissret towards .CacheMiss (the .CacheMiss code itself lives inside DoLookupCache)
//	inputs (see the call description at the top of this file):
//		$3 = ea, $4 = ea & ~scSPUCacheLineSizeMask, $5 = cSet << 4, $6 = cPrefOff, $7 = LRU increment,
//		$8 = shuffle mask used for spu_splats
//	registers read here without being loaded here (expected to be set up by the environment):
//		$73 = g_SPUCacheLRUCtrl, $75 = g_LRUCounter, $76 = g_SPUCacheDir, $70 = g_ProfID (DO_SPU_PROFILING only)
//		$74 is masked with the compare result to get the line offset (presumably one offset per way - TODO confirm)
//	hit:  $3 = $25 + $34 (pointer into the cache memory according to the return comment),
//				g_SPUCacheLRUCtrl[cSet] is stored back and $75 is incremented
//	miss: branches to .CacheMiss, which consumes the values precomputed here
.text
	.align 7
	.global	DoCacheMiss
	.type	DoCacheMiss, @function
DoCacheMiss:
#if defined(SUPP_SN) && !defined(_NO_SPU_ASSERT)
	//debug check: pause if ea looks like a local store address, the compare is (cEA >> 10) > 256 (unsigned)
	rotmi	$25,$3,-10			//0/4		cEA >> 10 (cEA / 1024)
	clgti $25,$25,256			//0/2		((cEA / 1024) > 256)?
	brnz	$25,.LSTestPassed//1/4 if((cEA / 1024) > 256) continue processing
	NOP										//0/0	
	stop 255							//1/4		custom snPause()
.LSTestPassed:	
#endif
#if defined(SUPP_DABR)
	//software data breakpoint: compare the word at g_sDABR.lsAddr against g_sDABR.oldVal, pause and update on change
	lqa $25,_ZN4NSPU7NDriver7g_sDABRE		//1/6		load g_sDABR
	lqd $26,0($25)				//1/6		const uint32 cCurDABRVal = *((uint32*)(void*)g_sDABR.lsAddr)
	rotqbyi $27, $25, 4		//1/4		rotate g_sDABR.oldVal into preferred slot
	rotqby $26, $26, $25	//1/4		rotate cCurDABRVal into preferred slot	
	ceq $28, $27, $26			//0/2		g_sDABR.oldVal == cCurDABRVal
	brnz $28,.DABRContMiss//1/4		if(g_sDABR.oldVal == cCurDABRVal) do nothing
	stop 255							//0/0		snPause()
	cwd		$29,4($sp)			//1/4		generate controls for insertion of cCurDABRVal into word slot 1 (g_sDABR.oldVal)
	shufb	$30,$26,$25,$29	//1/4		spu_insert(cCurDABRVal,g_sDABR,1)
	stqa $30,_ZN4NSPU7NDriver7g_sDABRE		//1/6		store updated g_sDABR	
.DABRContMiss:
#endif
#if defined(DO_SPU_PROFILING)
	//profiling: the hit counter is incremented speculatively here, .CacheMiss stores the unincremented $36 back
	ila	$55,_ZN4NSPU6NCache23g_SPUCacheProfIDCounterE		//0/2	load address of NSPU::NCache::g_SPUCacheProfIDCounter
	lqa	$36,_ZN4NSPU7NDriver11g_PerfStatsE			//1/6		load NSPU::NDriver::g_PerfStats	
	shli $56,$70,2				//0/4			g_ProfID << 2 (byte offset of the uint32 counter)
	lqa	$27,_ZN4NSPU6NCache17g_SPUCacheHitIncrE	//1/6		load NSPU::NCache::g_SPUCacheHitIncr
	NOP										//0/0	
	stqa $70,_ZN4NSPU6NCache19g_SPUCacheCurProfIDE//1/6	save current profile ID
#endif		
  andi	$22,$3,127			//0/2			const unsigned int cEAOff	= cEA & 127;
  hbra	.LookupMissret,.CacheMiss		//1/10		branch hint: the branch at .LookupMissret is expected to go to .CacheMiss
	shli	$25,$5,5				//0/4			const uint32 cSetShiftedBy9 = cSet << 9; ($5 already holds cSet << 4)
	lqx		$12,$5,$76			//1/6			load g_SPUCacheDir[cSet] = g_SPUCacheDir + cSetmul16
	ori		$21, $4, 0			//0/2			save const uint32 cEAAligned = cEA & ~scSPUCacheLineSizeMask
	shufb	$39,$4,$4,$8		//1/4			const vec_uint4 cEAAligned4	= spu_splats(cEAAligned);		
	rotmi	$4,$5,-4				//0/4			store cSet (from cSetmul16 >> 4) into register slot as expected by SPUCacheMissHandler_func
	lqx	$31,$5,$73				//1/6			load g_SPUCacheLRUCtrl[cSet] = g_SPUCacheLRUCtrl + cSetmul16
	il	$48,3							//0/2			load 3 for GetReplIndex
	lqa $37, _ZN4NSPU6NCache13g_PrefetchDirE		//1/6			load NCache::g_PrefetchDir
	a		$25, $22, $25			//0/2			(cSet << 9) + (cEA & 127)
	shufb	$7,$7,$7,$8			//1/4			const vec_uint4 cLRUIncr4	= spu_splats(cLRUIncr);			
	ceq	$12,$12,$39				//0/2			const vec_uint4 cCmpRes	= spu_cmpeq(g_SPUCacheDir[cSet], cEAAligned4);
	lqa $53, _ZN4NSPU6NCache13g_SPUAsyncDirE		//1/6			load NCache::g_SPUAsyncDir
	ila $72, 262143				//0/2			load LS size - 1 for the ea comparison in .CacheMiss
#if defined(DO_SPU_PROFILING)
	lqx	$57,$56,$55				//1/6			load quadword containing NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID]
	a	$26,$36,$27					//0/2			++NSPU::NDriver::g_PerfStats.cacheHits;//as uint32
	cwx	$59,$56,$55				//1/4			generate control mask for profile id insertion
	a		$58,$56,$55				//0/2			gen address &NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID]
	stqa $26,_ZN4NSPU7NDriver11g_PerfStatsE	//1/6		store NSPU::NDriver::g_PerfStats
	NOP										//0/0		
#endif	
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	rotqbyi $8, $6, 0			//1/4			move cPrefOff into parameter register for miss handler
#else	
	rotqbyi $9, $6, 0			//1/4			move cPrefOff into $9 as expected by SPUCacheMissHandler_spu.S
#endif	
	and $34, $74, $12			//0/2			mask the line offset according to address match
	gb  $35, $12					//1/4			gather bits to determine if to branch to cache miss handler ($35 == 0)
	a	$79,$75,$7					//0/2			g_LRUCounterIncr = spu_add(g_LRUCounter, cLRUIncr)
	rotqbyi	$40,$31,4			//1/4			const vec_uint4 cVal1	= spu_rlqwbyte(cReplCtrlValues, 4) (for GetReplIndex)
	ceq	$6, $39, $37			//0/2			const vec_uint4 cCmpRes = spu_cmpeq(g_PrefetchDir, cEAAligned4);	
	orx $34, $34					//1/4			or all word slots together, result in slot 0 (only the matching slot is non zero)
	selb $33,$31,$79,$12	//0/2			take the new counter value for the matching way, keep the existing LRU control for the others
	rotqbyi	$41,$31,8			//1/4			const vec_uint4 cVal2 = spu_rlqwbyte(cReplCtrlValues, 8) (for GetReplIndex)
	ceq	$53, $53, $39			//0/2			spu_cmpeq(cEASplat4, g_SPUAsyncDir)
#if defined(DO_SPU_PROFILING)
	rotqby	$2,$57,$58		//1/4			rotate NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID] into pref.slot
	ai	$2,$2,1						//0/2			((uint32*)g_SPUCacheProfIDCounter)[spu_extract(g_ProfID,0)] += 1
	shufb	$57,$2,$57,$59	//1/4			shuffle back
	NOP										//0/0			
	stqx	$57,$56,$55			//1/6			store updated NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID]
	NOP										//0/0			
#endif	
.LookupMissret:
	brz	$35,.CacheMiss		//1/4			if(cIndexInSet < 0) branch (if none of the addresses have matched -> $35 == 0)
	//hit path: commit the LRU update and return the pointer
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)
	stqx $33,$5,$73				//1/6			store updated g_SPUCacheLRUCtrl[cSet] = g_SPUCacheLRUCtrl + cSetmul16
	a		$3, $25, $34			//0/2			return (void*)(&NSPU::NCache::g_SPUCacheArray[lineStartOff + (cEA & scSPUCacheLineSizeMask)])
	bi	$lr								//1/4			return pRet;	
	.size	DoCacheMiss, .-DoCacheMiss
	
//-----------------------------------------DoLookupCache-------------------------------------------------------
//DoLookupCache: cache lookup entry point whose branch hint is set up for the hit case
//	compares the aligned ea against the four directory entries of the set, updates the LRU control of the
//		matching way and returns the pointer to the cached data; without a match it branches to .CacheMiss
//	the hbr below hints the return at .NoMissret, so the hit path is the predicted one
//	inputs (see the call description at the top of this file):
//		$3 = ea, $4 = ea & ~scSPUCacheLineSizeMask, $5 = cSet << 4, $6 = cPrefOff, $7 = LRU increment,
//		$8 = shuffle mask used for spu_splats
//	registers read here without being loaded here (expected to be set up by the environment):
//		$73 = g_SPUCacheLRUCtrl, $75 = g_LRUCounter, $76 = g_SPUCacheDir, $70 = g_ProfID (DO_SPU_PROFILING only)
//		$74 is masked with the compare result to get the line offset (presumably one offset per way - TODO confirm)
//	hit:  $3 = $25 + $34 (pointer into the cache memory according to the return comment),
//				g_SPUCacheLRUCtrl[cSet] is stored back and $75 is incremented
//	miss: branches to .CacheMiss, which consumes the values precomputed here
.text
	.align 7
	.global	DoLookupCache
	.type	DoLookupCache, @function
DoLookupCache:
#if defined(SUPP_SN) && !defined(_NO_SPU_ASSERT)
	//debug check: pause if ea looks like a local store address, the compare is (cEA >> 10) > 256 (unsigned)
	rotmi	$25,$3,-10			//0/4		cEA >> 10 (cEA / 1024)
	clgti $25,$25,256			//0/2		((cEA / 1024) > 256)?
	brnz	$25,.LSTestPassed1//1/4 if((cEA / 1024) > 256) continue processing
	NOP										//0/0	
	stop 255							//1/4		custom snPause()
.LSTestPassed1:	
#endif
#if defined(SUPP_DABR)
	//software data breakpoint: compare the word at g_sDABR.lsAddr against g_sDABR.oldVal, pause and update on change
	lqa $25,_ZN4NSPU7NDriver7g_sDABRE		//1/6		load g_sDABR
	lqd $26,0($25)				//1/6		const uint32 cCurDABRVal = *((uint32*)(void*)g_sDABR.lsAddr)
	rotqbyi $27, $25, 4		//1/4		rotate g_sDABR.oldVal into preferred slot
	rotqby $26, $26, $25	//1/4		rotate cCurDABRVal into preferred slot	
	ceq $28, $27, $26			//0/2		g_sDABR.oldVal == cCurDABRVal
	brnz $28,.DABRCont		//1/4		if(g_sDABR.oldVal == cCurDABRVal) do nothing
	stop 255							//0/0		snPause()
	cwd		$29,4($sp)			//1/4		generate controls for insertion of cCurDABRVal into word slot 1 (g_sDABR.oldVal)
	shufb	$30,$26,$25,$29	//1/4		spu_insert(cCurDABRVal,g_sDABR,1)
	stqa $30,_ZN4NSPU7NDriver7g_sDABRE		//1/6		store updated g_sDABR	
.DABRCont:
#endif
#if defined(DO_SPU_PROFILING)
	//profiling: the hit counter is incremented speculatively here, .CacheMiss stores the unincremented $36 back
	ila	$55,_ZN4NSPU6NCache23g_SPUCacheProfIDCounterE		//0/2	load address of NSPU::NCache::g_SPUCacheProfIDCounter
	lqa	$36,_ZN4NSPU7NDriver11g_PerfStatsE			//1/6		load NSPU::NDriver::g_PerfStats	
	shli $56,$70,2				//0/4			g_ProfID << 2 (byte offset of the uint32 counter)
	lqa	$27,_ZN4NSPU6NCache17g_SPUCacheHitIncrE	//1/6		load NSPU::NCache::g_SPUCacheHitIncr
	NOP										//0/0	
	stqa $70,_ZN4NSPU6NCache19g_SPUCacheCurProfIDE//1/6	save current profile ID
#endif		
	shli	$25,$5,5				//0/4			const uint32 cSetShiftedBy9 = cSet << 9; ($5 already holds cSet << 4)
	lqx		$12,$5,$76			//1/6			load g_SPUCacheDir[cSet] = g_SPUCacheDir + cSetmul16
	ori		$21, $4, 0			//0/2			save const uint32 cEAAligned = cEA & ~scSPUCacheLineSizeMask
	shufb	$39,$4,$4,$8		//1/4			const vec_uint4 cEAAligned4	= spu_splats(cEAAligned);		
	andi	$22,$3,127			//0/2			const unsigned int cEAOff	= cEA & 127;
	hbr	.NoMissret,$lr		//1/10		branch hint: the return at .NoMissret is the expected (hit) path
	rotmi	$4,$5,-4				//0/4			store cSet (from cSetmul16 >> 4) into register slot as expected by SPUCacheMissHandler_func
	lqx	$31,$5,$73				//1/6			load g_SPUCacheLRUCtrl[cSet] = g_SPUCacheLRUCtrl + cSetmul16
	il	$48,3							//0/2			load 3 for GetReplIndex
	lqa $37, _ZN4NSPU6NCache13g_PrefetchDirE		//1/6			load NCache::g_PrefetchDir
	a		$25, $22, $25			//0/2			(cSet << 9) + (cEA & 127)
	shufb	$7,$7,$7,$8			//1/4			const vec_uint4 cLRUIncr4	= spu_splats(cLRUIncr);			
	ceq	$12,$12,$39				//0/2			const vec_uint4 cCmpRes	= spu_cmpeq(g_SPUCacheDir[cSet], cEAAligned4);
	lqa $53, _ZN4NSPU6NCache13g_SPUAsyncDirE		//1/6			load NCache::g_SPUAsyncDir
	ila $72, 262143				//0/2			load LS size - 1 for the ea comparison in .CacheMiss
#if defined(DO_SPU_PROFILING)
	lqx	$57,$56,$55				//1/6			load quadword containing NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID]
	a	$26,$36,$27					//0/2			++NSPU::NDriver::g_PerfStats.cacheHits;//as uint32
	cwx	$59,$56,$55				//1/4			generate control mask for profile id insertion
	a		$58,$56,$55				//0/2			gen address &NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID]
	stqa $26,_ZN4NSPU7NDriver11g_PerfStatsE	//1/6		store NSPU::NDriver::g_PerfStats
	NOP										//0/0		
#endif	
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	rotqbyi $8, $6, 0			//1/4			move cPrefOff into parameter register for miss handler
#else	
	rotqbyi $9, $6, 0			//1/4			move cPrefOff into $9 as expected by SPUCacheMissHandler_spu.S
#endif	
	and $34, $74, $12			//0/2			mask the line offset according to address match
	gb  $35, $12					//1/4			gather bits to determine if to branch to cache miss handler ($35 == 0)
	a	$79,$75,$7					//0/2			g_LRUCounterIncr = spu_add(g_LRUCounter, cLRUIncr)
	rotqbyi	$40,$31,4			//1/4			const vec_uint4 cVal1	= spu_rlqwbyte(cReplCtrlValues, 4) (for GetReplIndex)
	ceq	$6, $39, $37			//0/2			const vec_uint4 cCmpRes = spu_cmpeq(g_PrefetchDir, cEAAligned4);	
	orx $34, $34					//1/4			or all word slots together, result in slot 0 (only the matching slot is non zero)
	selb $33,$31,$79,$12	//0/2			take the new counter value for the matching way, keep the existing LRU control for the others
	rotqbyi	$41,$31,8			//1/4			const vec_uint4 cVal2 = spu_rlqwbyte(cReplCtrlValues, 8) (for GetReplIndex)
	ceq	$53, $53, $39			//0/2			spu_cmpeq(cEASplat4, g_SPUAsyncDir)
#if defined(DO_SPU_PROFILING)
	rotqby	$2,$57,$58		//1/4			rotate NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID] into pref.slot
	ai	$2,$2,1						//0/2			((uint32*)g_SPUCacheProfIDCounter)[spu_extract(g_ProfID,0)] += 1
	shufb	$57,$2,$57,$59	//1/4			shuffle back
//	NOP										//0/0			
	stqx	$57,$56,$55			//1/6			store updated NSPU::NCache::g_SPUCacheProfIDCounter[g_ProfID]
//	NOP										//0/0			
#endif	
	brz	$35,.CacheMiss		//1/4			if(cIndexInSet < 0) branch (if none of the addresses have matched -> $35 == 0)
	//hit path: commit the LRU update and return the pointer
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)
	stqx $33,$5,$73				//1/6			store updated g_SPUCacheLRUCtrl[cSet] = g_SPUCacheLRUCtrl + cSetmul16
	a		$3, $25, $34			//0/2			return (void*)(&NSPU::NCache::g_SPUCacheArray[lineStartOff + (cEA & scSPUCacheLineSizeMask)])
.NoMissret:
	bi	$lr								//1/4			return pRet;
	.align 6	
//.CacheMiss: miss path shared by DoCacheMiss and DoLookupCache (reached when no way of the set matched)
//	- finishes GetReplIndex: $6 receives the index (0..3) of the way with the smallest LRU control value
//	- an ea inside the local store (ea <= 262143) is returned unchanged in $3
//	- otherwise the C handler SPUCacheMissHandler is called, or the included asm handler runs (SPU_CACHE_MISS_USE_ASM)
//	- after the handler: entries of g_PrefetchDir equal to g_CurWrittenEA are zeroed, the stack is checked
//		(unless _NO_SPU_ASSERT) and the function returns; if $70 is non zero it continues at .MissHandlerAtomicWriteBack
//	C handler configuration: an 80 byte stack frame is allocated here, relative to the new $sp the old $81 is
//		kept at 32($sp) and the old $80 at 64($sp); $81 carries the link register across the call
//	$50 is loaded with the write back mask address (261632) for .MissHandlerAtomicWriteBack
.CacheMiss:
	clgt	$43,$31,$40			//0/2			spu_cmpgt(cVal0, cVal1) (for GetReplIndex)	
	rotqbyi	$42,$31,12		//1/4			const vec_uint4 cVal3 = spu_rlqwbyte(cReplCtrlValues, 12) (for GetReplIndex)	
	il	$47,2							//0/2			load 2 for GetReplIndex
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	hbra .MissHandlerBranch, _ZN4NSPU6NCache19SPUCacheMissHandlerEjjU8__vectorjjS1_i	//1/10 branch hint for cache miss handler		
#else
	gb  $5, $6						//1/4			gather bits: spu_gather(cCmpRes)
#endif	
	selb	$45,$31,$40,$43	//0/2			const vec_uint4 cCmpSelRes01 = spu_sel(cVal0, cVal1, cCmpVec01)	(for GetReplIndex)
#if !defined(SPU_CACHE_MISS_USE_ASM)		
	rotqbyi $17, $80, 0		//1/4			save register $80 (to save orig. content after modifying it)		
	a	$80,$22,$77					//0/2			&g_sSPUCacheMem + cEAOff		
	lnop									//1/0		
	clgt $72, $3, $72			//0/2			(ea >= 256*1024)?
	lnop									//1/0		
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)
#else
	lnop									//1/0	
	clgt $72, $3, $72			//0/2			(ea >= 256*1024)?
	lnop									//1/0		
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)
	lnop									//1/0	
	il $24, 128						//0/2			load cache line size constant for $ch19, no need to set ever again then in asm
#endif
#if !defined(SPU_CACHE_MISS_USE_ASM)
	gb  $5, $6						//1/4			gather bits: spu_gather(cCmpRes)
#else	
	shufb	$10,$3,$3,$8		//1/4			vec_uint4 nextEA4 = spu_splats(cEA)	
#endif	
	andi	$49, $43, 1			//0/2			const vec_uint4 cCmpIndexRes01 = spu_sel((vec_uint4)0, (vec_uint4)1, cCmpVec01) (for GetReplIndex)
#if !defined(SPU_CACHE_MISS_USE_ASM)
	stqd	$81,-48($sp)		//1/6			save register 81 as required by ABI (faster restore for $lr than by lq, -48 = 32-80)
#else
	lqa	$66,_ZN4NSPU6NCache13g_CurAtomicEAE		//1/6		preload g_CurAtomicEA for asm miss handler
#endif	
	clgt	$44, $41,$42		//0/2			const vec_uint4 cCmpVec23	= spu_cmpgt(cVal2, cVal3) (for GetReplIndex) 
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	stqd	$sp,-80($sp)		//1/6			store stack pointer for SPUCacheMissHandler_func
	ai	$sp,$sp,-80				//0/2			decrement stack for SPUCacheMissHandler_func
#else	
	lqa	$23,_ZN4NSPU6NCache16g_PrefetchLRUDirE		//1/6		preload g_PrefetchLRUDir for asm cache miss handler
	clz	$5, $5						//0/2			const vec_uint4 cCntRes = spu_cntlz(spu_gather(cCmpRes));	
#endif	
#if !defined(SPU_CACHE_MISS_USE_ASM)
	//in this configuration the 80 byte frame is already allocated and $80 is already overwritten (see above),
	//	a direct return through $lr would hand a modified $sp and $80 back to the caller,
	//	therefore the LS address case leaves through a stub which undoes both
	brz	$72,.CacheMissLSAddrReturn	//1/4			if(ea < 256*1024) release frame, restore $80 and return ea
#else
	biz $72, $lr					//1/4			if(ea < 256*1024)return ea
#endif
	selb	$46,$41,$42,$44	//0/2			const vec_uint4 cCmpSelRes23 = spu_sel(cVal2, cVal3, cCmpVec23) (for GetReplIndex)
#if !defined(SPU_CACHE_MISS_USE_ASM)
	rotqbyi	$81, $lr, 0		//1/4			move $lr -> $81
#else	
	lnop									//1/0
#endif	
	selb	$51,$47,$48,$44	//0/2			const vec_uint4 cCmpIndexRes23 = spu_sel((vec_uint4)2, (vec_uint4)3, cCmpVec23) (for GetReplIndex)
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	stqd	$17,64($sp)			//1/6			save register 80 (saved into $17) as required by ABI	
#else	
	wrch $ch19,$24				//1/6			si_wrch(MFC_Size,si_from_uint(128))
#endif	
	clgt	$50, $45,$46		//0/2			const vec_uint4 cCmpVec0123 = spu_cmpgt(cCmpSelRes01, cCmpSelRes23) (for GetReplIndex)
	gb		$7, $53					//1/4			const vec_uint4 cPrefAsyncGatherRes = spu_gather(spu_cmpeq(cEASplat4, g_SPUAsyncDir));
#if !defined(SPU_CACHE_MISS_USE_ASM)
	clz	$5, $5						//0/2			const vec_uint4 cCntRes = spu_cntlz(spu_gather(cCmpRes));
#else	
	ceqi $11,$5,32				//0/2			spu_extract(cPrefetchCntlzRes, 0) == 32		
#endif	
#if defined(DO_SPU_PROFILING)
	stqa  $36,_ZN4NSPU7NDriver11g_PerfStatsE	//1/6		restore NSPU::NDriver::g_PerfStats (without increment)
#else	
	lqa $26, _ZN4NSPU6NCache20g_AsyncRangesDirFromE		//1/6			preload g_AsyncRangesDirFrom for asm cache miss handler
#endif
	selb	$6,$49,$51,$50	//0/2			const vec_uint4 cCmpSelRes0123 = spu_sel(cCmpIndexRes01, cCmpIndexRes23, cCmpVec0123) (for GetReplIndex)	
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	lnop									//1/0
	ai $75,$75,-1					//0/2		revert early increment to g_LRUCounter due to LS lookup handling
	//in case of a call to StartAtomicWriteBack(), the epilogue code is executed within the .MissHandlerAtomicWriteBack section
.MissHandlerBranch:	
	brsl	$lr,_ZN4NSPU6NCache19SPUCacheMissHandlerEjjU8__vectorjjS1_i	//1/4		SPUCacheMissHandler_func(cEA, cSet, cSize);
#else
	//it returns either inside or straight to MissHandlerAtomicWriteBack
	#include "SPUCacheMissHandler_spu.S"	//asm impl. of SPUCacheMissHandler, reuses all registers set
.SPUCacheMissHandlerEnd:	
#endif	
	//post miss handler code: $3 holds the handler result, $72 / $68 are compared as g_CurWrittenEA / g_PrefetchDir
	andc $24, $24, $24		//0/2			$24 = 0
	hbr	.Missret,$81			//1/10		branch hint for instructions past function call for miss case
	ila		$50, 261632			//0/2			load g_scWriteBackMaskAtomic = ((vec_uint4*)(256 * 1024 - 4*scSPUCacheLineSize))
	brnz $70, .MissHandlerAtomicWriteBack		//1/4		IF(g_AtomicEAToStart, false) StartAtomicWriteBack();
	ceq	$23,$72,$68				//0/2			const vec_uint4 cPrefCmpRes	= spu_cmpeq(g_CurWrittenEA, g_PrefetchDir);
	rotqbyi	$lr, $81, 0		//1/4			restore link register	
	//NOTE(review): $69 is overwritten with 28 here and is the first selb operand of the g_PrefetchLRUDir update below,
	//	while .MissHandlerAtomicWriteBack uses $69 without this load - confirm that this is intended
	il $69, 28						//0/2			gen a 28 for prefetch hits in asm cache miss handler
	lnop									//1/0
	selb $25,$68,$24,$23	//0/2			g_PrefetchDir	= spu_sel(g_PrefetchDir, (vec_uint4)0, cPrefCmpRes);
	lqd	$81,32($sp)				//1/6			restore register 81 as required by ABI
	a	$3,$80,$3						//0/2			const unsigned int cLineOff		= &g_sSPUCacheMem + cEAOff + lineStartOff;
	lnop									//1/0	
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)
	lqd	$80,64($sp)				//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)
	selb $26,$69,$24,$23	//0/2			g_PrefetchLRUDir	= spu_sel(g_PrefetchLRUDir, (vec_uint4)0, cPrefCmpRes);
	lnop									//1/0	
#if defined(_NO_SPU_ASSERT)	
	ai	$sp,$sp,80				//0/2			restore stack pointer		
#else	
	NOP										//0/0	
#endif		
	stqa	$25, _ZN4NSPU6NCache13g_PrefetchDirE		//1/6			store updated NCache::g_PrefetchDir
//	NOP										//0/0
	stqa	$26, _ZN4NSPU6NCache16g_PrefetchLRUDirE	//1/6			store updated NCache::g_PrefetchLRUDir
//	NOP										//0/0
#if !defined(_NO_SPU_ASSERT)
	//implement: if(GetStackAddress() <= NSPU::g_sProgramTopLS + STACK_WARNING_VAL) StackAssertFunc();
	//	$80 / $81 are used as temporaries for the return value / link register and reloaded from the frame afterwards
	ori $80, $3, 0				//0/2			save return value
	lqa $4, _ZN4NSPU15g_sProgramTopLSE	//1/6		load NSPU::g_sProgramTopLS
	ori $81, $lr, 0				//0/2			save link register
	lnop									//1/0		
	ai $4, $4, STACK_WARNING_VAL		//0/2		NSPU::g_sProgramTopLS + STACK_WARNING_VAL
	lnop									//1/0
	cgt $4, $4, $sp				//0/2			NSPU::g_sProgramTopLS + STACK_WARNING_VAL > GetStackAddress() (signed compare)
	brz $4, .NoStackFailed0//1/4			if(NSPU::g_sProgramTopLS + STACK_WARNING_VAL <= GetStackAddress()) do not call StackAssertFunc()
	NOP										//0/0	
	brsl	$lr, _Z15StackAssertFuncv	//1/4  //StackAssertFunc()
#if defined(SUPP_SN)
	lqa	$54,16							//1/6		load contents of upper 16 bytes of SPUDriver image spu_mod_hdr
	cwd	$55,12($sp)					//1/4		generate control word for insertion into spu_mod_hdr->pad
	shufb	$54,$lr,$54,$55		//1/4		insert branch target
	stqa	$54,16						//1/6		store updated spu_mod_hdr
	NOP											//0/0										
	lqa	$54,16							//1/6		make sure it has completed storage before stop
	ori	$3, $80, 0				//0/2			restore return value
	lqd	$80,64($sp)				//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)	
	ori	$lr, $81, 0				//0/2			restore link register
	lqd	$81,32($sp)				//1/6			restore register 81 as required by ABI	
	ai	$sp,$sp,80				//0/2			restore stack pointer
	STOPD										//1/4		custom snPause()
	bi	$lr									//1/4		branch to link register cont.
#endif
.NoStackFailed0:		
	ori	$3, $80, 0				//0/2			restore return value
	lqd	$80,64($sp)				//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)	
	ori	$lr, $81, 0				//0/2			restore link register
	lqd	$81,32($sp)				//1/6			restore register 81 as required by ABI	
	ai	$sp,$sp,80				//0/2			restore stack pointer
#endif //_NO_SPU_ASSERT
.Missret:	
	bi	$lr								//1/4			return pRet;
#if !defined(SPU_CACHE_MISS_USE_ASM)
//early return of .CacheMiss for local store addresses (C miss handler configuration only)
//	at the branch to this stub $3 still holds ea, $lr is unmodified, $17 holds the original $80
//	and $sp has been decremented by 80
.CacheMissLSAddrReturn:
	rotqbyi	$80, $17, 0		//1/4			restore register 80 (original content was saved into $17)
	ai	$sp,$sp,80				//0/2			release the frame allocated for SPUCacheMissHandler_func
	bi	$lr								//1/4			return ea (unchanged in $3)
	lnop									//1/0			never executed, pads the stub to 16 bytes so the alignment of the following code is unchanged
#endif
//StartAtomicWriteBack() code	
//	entered from the post miss handler code when $70 is non zero; expects $50 = 261632 (write back mask address)
//	- loads 8 mask quadwords ($50), 8 saved area quadwords (262016) and, after reading the atomic status
//		channel ($ch27), the 8 quadwords of the write back area (261888)
//	- merges them as spu_sel(area, savedArea, mask), stores the result into the write back area and
//		issues the putllc for g_CurAtomicEA through channels $ch16 / $ch18 / $ch21
//	- C handler configuration: also performs the epilogue of the miss path (restore $lr, $80, $81, compute the
//		return pointer in $3, update g_PrefetchDir / g_PrefetchLRUDir from $68 / $69)
//	- asm handler configuration: $25 / $26 are stored as they are (presumably prepared by the included
//		handler - TODO confirm)
//	- unless _NO_SPU_ASSERT is defined the stack check is executed before returning
.MissHandlerAtomicWriteBack:
	//be careful when putting pipeline 1 instructions together that the dual issue is 8 byte aligned again
	//this branch miss and load sequence should basically not cost anything since we are MFC bound here
	ila		$60, 262016			//0/2			load g_scWriteBackSavedArea = ((vec_uint4*)(256 * 1024 - scSPUCacheLineSize))
	lqd		$51, 0($50)			//1/6			load const vec_uint4 cWriteBackMasks0 = cpWriteBackMask[0];
	lqd		$52, 16($50)		//1/6			load const vec_uint4 cWriteBackMasks1 = cpWriteBackMask[1];
	lqd		$53, 32($50)		//1/6			load const vec_uint4 cWriteBackMasks2 = cpWriteBackMask[2];
	lqd		$54, 48($50)		//1/6			load const vec_uint4 cWriteBackMasks3 = cpWriteBackMask[3];
	lqd		$55, 64($50)		//1/6			load const vec_uint4 cWriteBackMasks4 = cpWriteBackMask[4];
	lqd		$56, 80($50)		//1/6			load const vec_uint4 cWriteBackMasks5 = cpWriteBackMask[5];
	lqd		$57, 96($50)		//1/6			load const vec_uint4 cWriteBackMasks6 = cpWriteBackMask[6];
	lqd		$58, 112($50)		//1/6			load const vec_uint4 cWriteBackMasks7 = cpWriteBackMask[7];
	lqd		$61, 0($60)			//1/6			load const vec_uint4 cWriteBackSavedAreas0 = cpWriteBackSavedArea[0];
	ila		$50, 261888			//0/2			load g_scWriteBackArea = ((vec_uint4*)(256 * 1024 - 2*scSPUCacheLineSize))		
	lqd		$62, 16($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas1 = cpWriteBackSavedArea[1];
#if !defined(SPU_CACHE_MISS_USE_ASM)
	or		$lr,$81, $81		//0/2			restore link register		
#else	
	NOP										//0/0
#endif
	lqd		$12, 32($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas2 = cpWriteBackSavedArea[2];
	lqd		$13, 48($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas3 = cpWriteBackSavedArea[3];
	lqd		$18, 64($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas4 = cpWriteBackSavedArea[4];
	lqd		$19, 80($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas5 = cpWriteBackSavedArea[5];
	lqd		$20, 96($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas6 = cpWriteBackSavedArea[6];
	lqd		$21, 112($60)		//1/6			load const vec_uint4 cWriteBackSavedAreas7 = cpWriteBackSavedArea[7];
	//the write back area is only read after the atomic status has been read
	rdch	$2,	 $ch27			//1/6			mfc_read_atomic_status();//sync for pWriteBackArea		
#if !defined(SPU_CACHE_MISS_USE_ASM)
	ceq	  $23, $72,$68		//0/2			const vec_uint4 cPrefCmpRes	= spu_cmpeq(g_CurWrittenEA, g_PrefetchDir);
#else	
	NOP										//0/0	
#endif 	
	lqd		$32, 16($50)		//1/6			load const vec_uint4 cWriteBackAreas1 = pWriteBackArea[1];
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	andc  $24, $24, $24		//0/2			$24 = 0
#else
	NOP										//0/0		
#endif 	
	lqd		$31,  0($50)		//1/6			load const vec_uint4 cWriteBackAreas0 = pWriteBackArea[0];
	NOP										//0/0		
#if !defined(SPU_CACHE_MISS_USE_ASM)
	hbr	.MissHandlerAtomicWriteBackReturn,$81			//1/10		branch hint for instructions past function call for miss case
#else	
	hbr	.MissHandlerAtomicWriteBackReturn,$lr			//1/10		branch hint for instructions past function call for miss case	
#endif	
#if !defined(SPU_CACHE_MISS_USE_ASM)	
	selb  $25, $68,$24,$23//0/2			g_PrefetchDir	= spu_sel(g_PrefetchDir, (vec_uint4)0, cPrefCmpRes);	
#else
	NOP										//0/0		
#endif 	
#if !defined(SPU_CACHE_MISS_USE_ASM)
	lqd	  $81,32($sp)			//1/6			restore register 81 as required by ABI
	a	    $3,$80,$3				//0/2			const unsigned int cLineOff		= &g_sSPUCacheMem + cEAOff + lineStartOff;
#endif	
	lqd		$33, 32($50)		//1/6			load const vec_uint4 cWriteBackAreas2 = pWriteBackArea[2];
//	NOP										//0/0
	lqd		$34, 48($50)		//1/6			load const vec_uint4 cWriteBackAreas3 = pWriteBackArea[3];
//	NOP										//0/0		
	lqd		$35, 64($50)		//1/6			load const vec_uint4 cWriteBackAreas4 = pWriteBackArea[4];
#if !defined(SPU_CACHE_MISS_USE_ASM)
	NOP										//0/0		
	lqd	  $80, 64($sp)		//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)	
#endif	
#if !defined(SPU_CACHE_MISS_USE_ASM)
	selb  $26, $69,$24,$23//0/2			g_PrefetchLRUDir	= spu_sel(g_PrefetchLRUDir, (vec_uint4)0, cPrefCmpRes);
#else
	NOP										//0/0		
#endif 	
	//merge and store: every select is interleaved with a load / store to dual issue
	lqd		$36, 80($50)		//1/6			load const vec_uint4 cWriteBackAreas5 = pWriteBackArea[5];
	selb	$31, $31,$61,$51//0/2			const vec_uint4 cSelResults0 = spu_sel(cWriteBackAreas0, cWriteBackSavedAreas0, cWriteBackMasks0);
	lqd		$37, 96($50)		//1/6			load const vec_uint4 cWriteBackAreas6 = pWriteBackArea[6];
	selb	$32, $32,$62,$52//0/2			const vec_uint4 cSelResults1 = spu_sel(cWriteBackAreas1, cWriteBackSavedAreas1, cWriteBackMasks1);
	lqd		$38, 112($50)		//1/6			load const vec_uint4 cWriteBackAreas7 = pWriteBackArea[7];
	selb	$33, $33,$12,$53//0/2			const vec_uint4 cSelResults2 = spu_sel(cWriteBackAreas2, cWriteBackSavedAreas2, cWriteBackMasks2);
	stqd	$31,  0($50)		//1/6			store cSelResults0
	selb	$34, $34,$13,$54//0/2			const vec_uint4 cSelResults3 = spu_sel(cWriteBackAreas3, cWriteBackSavedAreas3, cWriteBackMasks3);
	stqd	$32, 16($50)		//1/6			store cSelResults1
	selb	$35, $35,$18,$55//0/2			const vec_uint4 cSelResults4 = spu_sel(cWriteBackAreas4, cWriteBackSavedAreas4, cWriteBackMasks4);
	stqd	$33, 32($50)		//1/6			store cSelResults2
	selb	$36, $36,$19,$56//0/2			const vec_uint4 cSelResults5 = spu_sel(cWriteBackAreas5, cWriteBackSavedAreas5, cWriteBackMasks5);
	stqd	$34, 48($50)		//1/6			store cSelResults3
	selb	$37, $37,$20,$57//0/2			const vec_uint4 cSelResults6 = spu_sel(cWriteBackAreas6, cWriteBackSavedAreas6, cWriteBackMasks6);
	lqa		$43,_ZN4NSPU6NCache13g_CurAtomicEAE		//1/6		load NSPU::NCache::g_CurAtomicEA
	il		$17, 128				//0/2			prepare for mfc_putllc(pWriteBackArea, g_CurAtomicEA, 0, 0)
	stqd	$35, 64($50)		//1/6			store cSelResults4
	selb	$38, $38,$21,$58//0/2			const vec_uint4 cSelResults7 = spu_sel(cWriteBackAreas7, cWriteBackSavedAreas7, cWriteBackMasks7);
	stqd	$36, 80($50)		//1/6			store cSelResults5
	il		$60, 0					//0/2			load 0 for wrch	$ch20 (the wrch itself is commented out below)
	stqd	$37, 96($50)		//1/6			store cSelResults6
	il		$62, 180				//0/2			load mask for mfc_putllc start command
	stqd	$38,112($50)		//1/6			store cSelResults7
#if !defined(SPU_CACHE_MISS_USE_ASM) && defined(_NO_SPU_ASSERT)
	ai	$sp,$sp,80				//0/2			restore stack pointer		
#else	
	NOP										//0/0
#endif	
	wrch	$ch16,$50				//1/6			prepare for mfc_putllc(pWriteBackArea, g_CurAtomicEA, 0, 0)
	//128 as cache size and the upper 32 bits of the EA do not need to be written
	wrch	$ch18,$43				//1/6			prepare for mfc_putllc(pWriteBackArea, g_CurAtomicEA, 0, 0)
//	wrch	$ch20,$60				//1/6			prepare for mfc_putllc(pWriteBackArea, g_CurAtomicEA, 0, 0)
	wrch	$ch21,$62				//1/6			start mfc_putllc(pWriteBackArea, g_CurAtomicEA, 0, 0)
	stqa	$25, _ZN4NSPU6NCache13g_PrefetchDirE		//1/6			store updated NCache::g_PrefetchDir	
	stqa	$26, _ZN4NSPU6NCache16g_PrefetchLRUDirE	//1/6			store updated NCache::g_PrefetchLRUDir
#if !defined(_NO_SPU_ASSERT)
	//implement: if(GetStackAddress() <= NSPU::g_sProgramTopLS + STACK_WARNING_VAL) StackAssertFunc();
#if defined(SPU_CACHE_MISS_USE_ASM)	
	//asm handler configuration: no frame exists yet, build the same 80 byte frame layout as the C handler path
	//	(old $81 at 32($sp), old $80 at 64($sp) relative to the new $sp)
	NOP										//0/0	
	stqd	$81,-48($sp)		//1/6			save register 81 as required by ABI
	NOP										//0/0	
	stqd	$80,-16($sp)		//1/6			save register 80 as required by ABI
	NOP										//0/0	
	stqd	$sp,-80($sp)		//1/6			store stack pointer for StackAssertFunc
	ai	$sp,$sp,-80				//0/2			decrement stack for StackAssertFunc
	lnop									//1/0				
#endif		
	//$80 / $81 are used as temporaries for the return value / link register and reloaded from the frame afterwards
	ori $80, $3, 0				//0/2			save return value
	lqa $4, _ZN4NSPU15g_sProgramTopLSE	//1/6		load NSPU::g_sProgramTopLS
	ori $81, $lr, 0				//0/2			save link register
	lnop									//1/0		
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)		
	lnop									//1/0			
	ai $4, $4, STACK_WARNING_VAL		//0/2		NSPU::g_sProgramTopLS + STACK_WARNING_VAL
	lnop									//1/0
	cgt $4, $4, $sp				//0/2			NSPU::g_sProgramTopLS + STACK_WARNING_VAL > GetStackAddress() (signed compare)
	brz $4, .NoStackFailed1//1/4			if(NSPU::g_sProgramTopLS + STACK_WARNING_VAL <= GetStackAddress()) do not call StackAssertFunc()
	NOP										//0/0	
	brsl	$lr, _Z15StackAssertFuncv	//1/4  //StackAssertFunc()
#if defined(SUPP_SN)
	lqa	$54,16							//1/6		load contents of upper 16 bytes of SPUDriver image spu_mod_hdr
	cwd	$55,12($sp)					//1/4		generate control word for insertion into spu_mod_hdr->pad
	shufb	$54,$lr,$54,$55		//1/4		insert branch target
	stqa	$54,16						//1/6		store updated spu_mod_hdr
	NOP											//0/0										
	lqa	$54,16							//1/6		make sure it has completed storage before stop
	ori	$3, $80, 0				//0/2			restore return value
	lqd	$80,64($sp)				//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)	
	ori	$lr, $81, 0				//0/2			restore link register
	lqd	$81,32($sp)				//1/6			restore register 81 as required by ABI	
	ai	$sp,$sp,80				//0/2			restore stack pointer
	STOPD										//1/4		custom snPause()
	bi	$lr									//1/4		branch to link register cont.
#endif
.NoStackFailed1:		
	ori	$3, $80, 0				//0/2			restore return value
	lqd	$80,64($sp)				//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)	
	ori	$lr, $81, 0				//0/2			restore link register
	lqd	$81,32($sp)				//1/6			restore register 81 as required by ABI	
	ai	$sp,$sp,80				//0/2			restore stack pointer
#else	
	ai	$75,$75,1					//0/2			++NCache::g_LRUCounter; (all word slots)	
#endif //_NO_SPU_ASSERT
.MissHandlerAtomicWriteBackReturn:
	bi	$lr								//1/4			return pRet;
	.size	DoLookupCache, .-DoLookupCache

//-----------------------------------------FlushCacheComplete-------------------------------------------------

#if defined(SPU_CACHE_MISS_USE_ASM)
	//implement void FlushCacheComplete(const int cDoSync)
.text
	.align 7
	.global	FlushCacheComplete
	.type	FlushCacheComplete, @function
FlushCacheComplete:
	ori		$6,$76,	0				//0/2		&g_pSPUCacheDir[0]
	lqa		$8,_ZN4NSPU6NCache17g_pSPUShadowCacheE	//1/6		load &g_pSPUShadowCache[0]
	andi	$75, $75, 0			//0/2		reset LRU counter to avoid wrap arround
	lqa		$4,_ZN4NSPU6NCache12g_SPUNumSetsE	//1/6		load g_SPUNumSets
	il		$9,	6						//0/2		load g_scDMAOutputTag
	lqd		$27,0($76)			//1/6		load g_pSPUCacheDir[0]
	il		$14,0						//0/2		cacheEntry = 0
	lqa		$18,_ZN4NSPU6NCache18g_AsyncRangesDirToE	//1/6		load g_AsyncRangesDirFrom
	NOP
	stqa	$75, _ZN4NSPU6NCache16g_PrefetchLRUDirE		//1/6		NCache::g_PrefetchLRUDir	= spu_splats((uint32)0)	
	ila		$29,.FlushCacheLoop				//0/2		load branch points
	lqa		$66,_ZN4NSPU6NCache13g_CurAtomicEAE		//1/6			load g_CurAtomicEA
	il		$11,0						//0/2		cacheEntry = 0
	lqa		$19,_ZN4NSPU6NCache20g_AsyncRangesDirFromE	//1/6		load g_AsyncRangesDirTo
	ila		$10,66051				//0/2		load shuffle mask for spu_splats
	stqa	$75, _ZN4NSPU6NCache13g_PrefetchDirE		//1/6		NCache::g_PrefetchDir	= spu_splats((uint32)0)
	il		$5, 128					//0/2		gen scSPUCacheLineSize for DMA setup
	fsmbi $35, 0 					//1/4		gen 0 to nullify g_CurAtomicEA at the end
	il    $21, 0					//0/2		keep track if there is any asynchronous transfer to be syncd	
	rotqbyi	$12, $27, 0		//1/4		g_pSPUCacheDir[0]
	shli	$4,$4,2					//0/4		const int cSPUCacheEntries = (g_SPUNumSets << scSPUCacheSetNumWaysShift)
	rotqbyi $23,$lr,0			//1/4		save link register
	ila		$30,.FlushCacheLoopExited	//0/2		load branch points	
	wrch	$ch19,$5				//1/6		si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize))
	andi	$7,$7, 0 				//0/2		i =0
	wrch	$ch20,$9				//1/6		si_wrch(MFC_TagID,si_from_uint(g_scDMAOutputTag))
//flush loop body, one iteration per cache line:
//	- a directory entry with EA == 0 is skipped (.CacheLineUnchanged)
//	- otherwise all 8 quadwords of the line are xor-ed with the shadow copy, the result is stored as
//	  pWriteBackMask[0..7] and or-reduced into $39; a non zero $39 leaves through .CacheLineChanged
//	- in parallel $60 is computed: non zero if the EA lies inside one of the async ranges (To > EA and not From > EA)
//	- .CacheLineUnchanged advances index/set/cacheEntry and branches through $31, which is computed without a
//	  conditional branch to be either .FlushCacheLoop or .FlushCacheLoopExited
.FlushCacheLoop:				//			do{
	ai		$11,$11,4				//0/2		++index(4 -> rotation mask for accessing g_pSPUCacheDir[set][index])
	brz		$12,.CacheLineUnchanged	//1/4		if (cEA == 0) goto .CacheLineUnchanged
	a			$16,$14,$77			//0/2		cpCurLine	= &g_pSPUCache[cacheEntry]
	lqx		$50,$14,$8			//1/6		const vec_uint4 cShadowLine0 = cpShadowLine[0]
	a			$15,$14,$8			//0/2		const vec_uint4* const __restrict cpShadowLine = &g_pSPUShadowCache[cacheEntry]
	lqx		$40,$14,$77			//1/6		const vec_uint4 cCurLine0 = cpCurLine[0]
//	NOP										//0/0
	lqd		$41,16($16)			//1/6		const vec_uint4 cCurLine1 = cpCurLine[1]
//	NOP										//0/0
	lqd		$51,16($15)			//1/6		const vec_uint4 cShadowLine1 = cpShadowLine[1]
//	NOP										//0/0
	shufb	$13,$12,$12,$10	//1/4		const vec_uint4 cEASplat4 = spu_splats(cEA)
//	NOP										//0/0
	lqd		$42,32($16)			//1/6		const vec_uint4 cCurLine2 = cpCurLine[2]
	NOP										//0/0
	lqd		$52,32($15)			//1/6		const vec_uint4 cShadowLine2 = cpShadowLine[2]
	xor		$50,$40,$50			//0/2		pWriteBackMask[0] = spu_xor(cCurLine0, cShadowLine0)
	lqd		$43,48($16)			//1/6		const vec_uint4 cCurLine3 = cpCurLine[3]
	cgt		$61,$19,$13			//0/2		const vec_uint4 cFromCmpRes = spu_cmpgt(g_AsyncRangesDirFrom, cEASplat)
	lqd		$53,48($15)			//1/6		const vec_uint4 cShadowLine3 = cpShadowLine[3]
	xor		$51,$41,$51			//0/2		pWriteBackMask[1] = spu_xor(cCurLine1, cShadowLine1)
	lqd		$44,64($16)			//1/6		const vec_uint4 cCurLine4 = cpCurLine[4]
	cgt		$60,$18,$13			//0/2		const vec_uint4 cToCmpRes = spu_cmpgt(g_AsyncRangesDirTo, cEASplat)
	lqd		$54,64($15)			//1/6		const vec_uint4 cShadowLine4 = cpShadowLine[4]
	or		$39,$50,$51			//0/2		vec_uint4 diffVec = spu_or(pWriteBackMask[0], pWriteBackMask[1])
	lqd		$45,80($16)			//1/6		const vec_uint4 cCurLine5 = cpCurLine[5]
	xor		$52,$42,$52			//0/2		pWriteBackMask[2] = spu_xor(cCurLine2, cShadowLine2)
	lqd		$55,80($15)			//1/6		const vec_uint4 cShadowLine5 = cpShadowLine[5]
	andc	$60,$60,$61			//0/2		const vec_uint4 cFinalCmpRes = spu_andc(cToCmpRes, cFromCmpRes), set where From <= EA < To
	lqd		$46,96($16)			//1/6		const vec_uint4 cCurLine6 = cpCurLine[6]
	xor		$53,$43,$53			//0/2		pWriteBackMask[3] = spu_xor(cCurLine3, cShadowLine3)
	lqd		$56,96($15)			//1/6		const vec_uint4 cShadowLine6 = cpShadowLine[6]
	or		$39,$39,$52			//0/2		diffVec = spu_or(diffVec, pWriteBackMask[2])
	lqd		$47,112($16)		//1/6		const vec_uint4 cCurLine7 = cpCurLine[7]
	xor		$54,$44,$54			//0/2		pWriteBackMask[4] = spu_xor(cCurLine4, cShadowLine4)
	lqd		$57,112($15)		//1/6		const vec_uint4 cShadowLine7 = cpShadowLine[7]
	or		$39,$39,$53			//0/2		diffVec = spu_or(diffVec, pWriteBackMask[3])
	stqa	$50,261760			//1/6		store pWriteBackMask[0] (absolute LS address, re-read at .CacheLineChanged)
	xor		$55,$45,$55			//0/2		pWriteBackMask[5] = spu_xor(cCurLine5, cShadowLine5)
	stqa	$51,261776			//1/6		store pWriteBackMask[1]
	or		$39,$39,$54			//0/2		diffVec = spu_or(diffVec, pWriteBackMask[4])
	stqa	$52,261792			//1/6		store pWriteBackMask[2]
	xor		$56,$46,$56			//0/2		pWriteBackMask[6] = spu_xor(cCurLine6, cShadowLine6)
	stqa	$53,261808			//1/6		store pWriteBackMask[3]
	or		$39,$39,$55			//0/2		diffVec = spu_or(diffVec, pWriteBackMask[5])
	stqa	$54,261824			//1/6		store pWriteBackMask[4]
	xor		$57,$47,$57			//0/2		pWriteBackMask[7] = spu_xor(cCurLine7, cShadowLine7)
	stqa	$55,261840			//1/6		store pWriteBackMask[5]
	or		$39,$39,$56			//0/2		diffVec = spu_or(diffVec, pWriteBackMask[6])
	stqa	$56,261856			//1/6		store pWriteBackMask[6]
	NOP										//0/0
	stqa	$57,261872			//1/6		store pWriteBackMask[7]
	or		$39,$39,$57			//0/2		diffVec = spu_or(diffVec, pWriteBackMask[7])
	orx		$60,$60					//1/4		spu_orx(cFinalCmpRes), $60 != 0 -> cTransferAsync (tested at .CacheLineChanged)
//	NOP										//0/0
//	lnop									//1/0
//	NOP										//0/0
	orx		$39,$39					//1/4		spu_orx(diffVec), $39 != 0 -> line differs from its shadow copy
//	NOP										//0/0
//	lnop									//1/0
//	NOP										//0/0
//	lnop									//1/0
//	NOP										//0/0
//	lnop									//1/0
//	NOP										//0/0
	brnz	$39, .CacheLineChanged	//1/4		IF(!cIsCacheLineUnChanged, false)
.CacheLineUnchanged:	
	andi	$26,$11,16			//0/2		if (index == 16) ++set, $26 = 16 (one directory quadword) on wrap, else 0
	hbra .FlushCacheLoopAgain1, .FlushCacheLoop		//1/10	branch hint for next loop iteration
	ai		$7,$7,1					//0/2		++i
//	lnop									//1/0
//	NOP										//0/0
	lqx		$27,$26,$6			//1/6		load g_pSPUCacheDir[set]	
	sf		$31,$29,$30			//0/2		.FlushCacheLoopExited - .FlushCacheLoop
	lnop									//1/0
	ceq		$28,$7,$4				//0/2		i == cSPUCacheEntries	(all ones if the loop is done, else 0)
	lnop									//1/0
	andi	$11,$11,15			//0/2		wrap index byte offset: index &= 15 (the +4 step was already applied at loop top)
//	lnop									//1/0
//	NOP										//0/0
	lnop									//1/0
	and		$31,$31,$28			//0/2		branch diff masked together with branch condition mask	
	lnop									//1/0
	ai		$14,$14,128			//0/2		cacheEntry += scSPUCacheLineSize
	rotqby $12,$27,$11		//1/4		const uint32 cEA = spu_extract(g_pSPUCacheDir[cSet], cIndexInSet)	
	a			$31,$29,$31			//0/2		comp.branch target:	.FlushCacheLoop + mask & (.FlushCacheLoopExited - .FlushCacheLoop)
//	lnop									//1/0
//	NOP										//0/0
//	lnop									//1/0
	a			$6,$6,$26				//0/2		&g_pSPUCacheDir[set]
.FlushCacheLoopAgain1:	
	bi $31								//1/4		WHILE(i < cSPUCacheEntries, true), target is .FlushCacheLoop or .FlushCacheLoopExited

.align 4	
//changed cache line: either hand it to .TransferAsync (EA inside an async range, $60 != 0) or start an atomic
//write back. Before a new atomic write back is started, a still pending one ($66 = g_CurAtomicEA != 0) is waited for.
//the pending write back mask and the current line contents are copied to fixed LS areas before
//.StartAtomicWriteBack is entered; the return address for it is passed in $65.
.CacheLineChanged:	
#if defined(SUPP_DABR)
	lqa $26,_ZN4NSPU7NDriver7g_sDABRE		//1/6		load g_sDABR
	rotqbyi $26, $26, 8		//1/4		rotate g_sDABR.ppuEA into preferred slot
	ceq $26, $26, $12			//0/2		g_sDABR.ppuEA == cEA
	brz $26,.DABRContFlush//1/4		continue if g_sDABR.ppuEA != cEA, otherwise fall into the stop below
	stop 255							//0/0		snPause()
	lnop									//1/0
.DABRContFlush:
#endif
	andi	$26,$11,16			//0/2		if (index == 16) ++set, $26 = 16 on wrap, else 0 (consumed at .SetupFlushCacheLoopAgain)
	brnz	$60,.TransferAsync			//1/4		IF(cTransferAsync, false)
	ila		$24,261888			//0/2		gen the LS address for the atomic transfer (g_scWriteBackArea)
	brz		$66,.NoCurSyncTransfer	//1/4		if(g_CurAtomicEA != 0)
.FlushCacheCheckSync:	
	ila		$65, .FlushCacheCheckSync		//0/2	load return address for .FlushCacheLineSyncAgain
	rdch	$2,$ch27				//1/6		int status = mfc_read_atomic_status()
	NOP										//0/0
	brnz  $2, .FlushCacheLineSyncAgain	//1/4		IF(status != 0, false), STALLS: 5
.NoCurSyncTransfer:	
	il		$45,208					//0/2		gen MFC command 208 (same command that .FlushCacheLineSyncAgain annotates as mfc_getllar)
	wrch	$ch16,$24				//1/6		si_wrch(MFC_LSA,si_from_ptr(g_scWriteBackArea))
	ori		$66, $12, 0			//0/2		NCache::g_CurAtomicEA = cEA
	wrch	$ch18,$12				//1/6		si_wrch(MFC_EAL,si_from_uint(mfc_ea2l(cEA)))
	NOP										//0/0
	wrch	$ch21,$45				//1/6		si_wrch(MFC_Cmd, si_from_uint(208))
	ila		$65, .SetupFlushCacheLoopAgain	//0/2	gen the return address ($65) for .JumpToStartAtomicWriteBack
	hbra	.JumpToStartAtomicWriteBack, .StartAtomicWriteBack			//branch hint for call to StartAtomicWriteBack
	//implements CopyCacheLine((vec_uint4*)g_scWriteBackMaskAtomic, (vec_uint4*)g_scWriteBackMask)	
	//(source 261760.. is the pWriteBackMask area stored by .FlushCacheLoop, destination is 261632..)
	lqa		$40,261760			//1/6		
	lqa		$41,261776			//1/6		
	lqa		$42,261792			//1/6		
	lqa		$43,261808			//1/6		
	lqa		$44,261824			//1/6		
	lqa		$45,261840			//1/6		
	lqa		$46,261856			//1/6		
	lqa		$47,261872			//1/6		
	stqa	$40,261632			//1/6		
	stqa	$41,261648			//1/6		
	stqa	$42,261664			//1/6		
	stqa	$43,261680			//1/6		
	stqa	$44,261696			//1/6		
	stqa	$45,261712			//1/6		
	stqa	$46,261728			//1/6		
	stqa	$47,261744			//1/6		
	//implements CopyCacheLine(g_scWriteBackSavedArea, &g_pSPUCache[cCacheEntry])	
	//($16 is cpCurLine as computed at the top of .FlushCacheLoop, destination is 262016..)
	lqd		$40,0($16)			//1/6		
	lqd		$41,16($16)			//1/6		
	lqd		$42,32($16)			//1/6		
	lqd		$43,48($16)			//1/6		
	lqd		$44,64($16)			//1/6		
	lqd		$45,80($16)			//1/6		
	lqd		$46,96($16)			//1/6		
	lqd		$47,112($16)		//1/6		
	stqa	$40,262016			//1/6		
	stqa	$41,262032			//1/6		
	stqa	$42,262048			//1/6		
	stqa	$43,262064			//1/6		
	stqa	$44,262080			//1/6		
	stqa	$45,262096			//1/6		
	stqa	$46,262112			//1/6		
	stqa	$47,262128			//1/6
	NOP										//0/0
.JumpToStartAtomicWriteBack:	
	br	 .StartAtomicWriteBack		//1/4		call .StartAtomicWriteBack (defined in SPUCacheMissHandler_spu.S), return address was loaded into $65 above
//asynchronous write back of a changed line: the EA lies inside an async range, so the line is written back
//with a plain DMA put straight from the cache ($16 = cpCurLine) and only flagged in $21 for the final tag sync.
.TransferAsync:	
	il		$60,32					//0/2		gen MFC_PUT_CMD for DMA transfer
	wrch	$ch16,$16				//1/6		si_wrch(MFC_LSA,si_from_ptr(&g_pSPUCache[cacheEntry]))
	ori		$21, $21, 1			//0/2		track that there is a asynchronous transfer to be syncd at the end
	wrch	$ch18,$12				//1/6		si_wrch(MFC_EAL,si_from_uint(cEA))
	NOP										//0/0
	wrch	$ch21,$60				//1/6		si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD))
//loop continuation for the changed line paths. Reached by fall through from .TransferAsync and as the return
//point handed to .StartAtomicWriteBack in $65. Advances index/set/cacheEntry/i like .CacheLineUnchanged does,
//but loops with a conditional branch (.FlushCacheLoopAgain) instead of the computed bi $31.
.SetupFlushCacheLoopAgain:
	NOP										//0/0
	//the hint has to name the branch this path really executes (.FlushCacheLoopAgain, 9 instructions ahead);
	//hinting .FlushCacheLoopAgain1 (the bi $31 of the unchanged path) left the brz below unhinted
	hbra .FlushCacheLoopAgain, .FlushCacheLoop		//1/10	branch hint for next loop iteration
	andi	$11,$11,15			//0/2		wrap index byte offset: index &= 15 (the +4 step was already applied at loop top)
	lqx		$27,$6,$26			//1/6		load g_pSPUCacheDir[set] ($26 = 16 if the index wrapped, else 0)
	ai		$14,$14,128			//0/2		cacheEntry += scSPUCacheLineSize
	lnop									//1/0
	ai		$7,$7,1					//0/2		++i
//	lnop									//1/0
	a			$6,$6,$26				//0/2		&g_pSPUCacheDir[set]
//	lnop									//1/0
	ceq		$28,$7,$4				//0/2		i == cSPUCacheEntries
//	lnop									//1/0
//	NOP										//0/0
//	lnop									//1/0
//	NOP										//0/0
//	lnop									//1/0
//	NOP										//0/0
	rotqby $12,$27,$11		//1/4		const uint32 cEA = spu_extract(g_pSPUCacheDir[cSet], cIndexInSet)	
	NOP										//0/0
.FlushCacheLoopAgain:	
	brz		$28, .FlushCacheLoop	//1/4		WHILE(i < cSPUCacheEntries, true), falls through into .FlushCacheLoopExited when done

//loop exit: wait for a still pending atomic write back ($66 != 0), then either return directly or, if any
//asynchronous put was issued and is to be syncd ($21 != 0), go through .RetFlushCacheSyncASyncTransfer.
.FlushCacheLoopExited:		
	//loop has exited here
	and		$21,$21,$3			//0/2		check if any transfer is to be syncd ($3 is not written in the visible code, presumably a caller supplied sync flag - confirm)
	hbr		.RetFlushCache,$23//1/10	branch hint for the return through $23 at .RetFlushCache
	NOP										//0/0
	brz		$66,.NoFinalSyncTransfer	//1/4		if(g_CurAtomicEA != 0)
.FlushCacheFinalCheckSync:	
	ila		$65, .FlushCacheFinalCheckSync		//0/2	load return address for .FlushCacheLineSyncAgain
	rdch	$2,$ch27				//1/6		int status = mfc_read_atomic_status()
	NOP										//0/0
	brnz  $2, .FlushCacheLineSyncAgain	//1/4		IF(status != 0, false), STALLS: 5
.NoFinalSyncTransfer:	
	il		$34,64					//0/2		gen 1<<g_scDMAOutputTag (tag mask consumed by .RetFlushCacheSyncASyncTransfer)
	brnz	$21, .RetFlushCacheSyncASyncTransfer		//1/4		IF(cSyncTransfer, false)
	NOP										//0/0
	stqa	$35,_ZN4NSPU6NCache13g_CurAtomicEAE		//1/6			g_CurAtomicEA=0 ($35 was zeroed during setup)
	ori		$lr,$23,0				//0/2		restore link register
.RetFlushCache:
	bi		$23							//1/4		return

//return path with DMA sync: selects the output tag ($34 = tag mask generated at .NoFinalSyncTransfer), requests
//an update once all transfers of that tag are done and reads the tag status before returning through $23.
.RetFlushCacheSyncASyncTransfer:
	il		$41,2						//0/2		gen MFC_TAG_UPDATE_ALL
	hbr		.RetFlushCacheSyncASyncTransferRet, $23	//branch hint for the return through $23
//	NOP										//0/0	
	wrch	$ch22,$34				//1/6		spu_writech(MFC_WrTagMask, 1<<g_scDMAOutputTag)
//	NOP										//0/0	
	wrch	$ch23,$41				//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL)
	NOP										//0/0	
	stqa	$35,_ZN4NSPU6NCache13g_CurAtomicEAE		//1/6			g_CurAtomicEA=0
	ori		$lr,$23,0				//0/2		restore link register
	rdch	$2,$ch24				//1/6		spu_readch(MFC_RdTagStat), result in $2 is not used afterwards in this block
	NOP										//0/0	
.RetFlushCacheSyncASyncTransferRet:	
	bi		$23							//1/4		return

//entered from .FlushCacheCheckSync / .FlushCacheFinalCheckSync when the atomic status read was non zero:
//re-issues MFC command 208 for the pending atomic EA ($66) with the LS address in $24 and enters
//.StartAtomicWriteBack again. The caller has loaded $65 with the label to come back to.
.FlushCacheLineSyncAgain:
	il		$45,208					//0/2		gen MFC command 208 (annotated as mfc_getllar below)
	wrch	$ch16,$24				//1/6		mfc_prep: MFC_LSA = $24
	wrch	$ch18,$66				//1/6		mfc_prep: MFC_EAL = g_CurAtomicEA
	wrch	$ch21,$45				//1/6		mfc_getllar_again
	NOP										//0/0
	br		.StartAtomicWriteBack	//1/4		call StartAtomicWriteBack

	.size	FlushCacheComplete, .-FlushCacheComplete

#endif //SPU_CACHE_MISS_USE_ASM

#endif //PS3
