#if defined(PS3)

//asm implementation of 
//		const int SPUCacheMissHandler
//		(
//			const uint32 cEA, 
//			const uint16 cSet,
//			const vec_uint4 cPrefetchCntlzRes,
//			const uint32 cReplIndex,
//			const vec_uint4 cPrefAsyncGatherRes,
//			const int32 cPrefOff
//		)
//
//	$ 3: cEA	(input)	-> lineStartOff (output)
//	$ 4: cSet																											(input)
//	$ 5: cPrefetchCntlzRes																				(input)
//	$ 6: cReplIndex																								(input)
//	$ 7: cPrefAsyncGatherRes																			(input)
//	$ 8: 66051 (splat mask for uint32)														(input)
//	$ 9: cPrefOff																									(input)
//	$10: nextEA4																									(input)
//	$11: spu_extract(cPrefetchCntlzRes, 0) == 32									(input)
//	$21: const uint32 cEAAligned = cEA & ~scSPUCacheLineSizeMask  (input)
//	$22: &g_sSPUCacheMem + cEAOff		(must be kept)							  (output)
//	$23: g_PrefetchLRUDir																					(input)
//	$24: g_PrefetchDir																						(input)
//	$25: g_pSPUShadowCache																				(input)
//	$26: g_AsyncRangesDirFrom																			(input)
//	$66: g_CurAtomicEA, keep and store at the end									(input)
//	$68: g_PrefetchDirReg for post miss handler code							(output)
//	$79: g_CurWrittenEA for post miss handler code								(output)
//
//	link register must be preserved
//		
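//	use $2 and $40..$65 as working registers
//	NOTE: the code below additionally writes $12..$20, $27..$39, $69, $70 and $75,
//	and reads $72, $73, $76 and $77, none of which are listed above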
//
//	$65 also acts as link register for .StartAtomicWriteBack
//
//	starts in pipeline 1
//


	// DoPrefetchLookup: 
	//	ret value:			$13
	//	pCacheSetmask		$14
	//	pReplIndex			$15
	//	rNextEA4				$10
	//	rReplMask				$16
	//
	// Entry section of the miss handler.
	//	- branches to .PrefetchHit when $11 is zero
	//	- branches to .SyncAtomicWriteBackPref when cEAAligned equals g_CurAtomicEA
	//	- branches to .SyncAsyncTransferPref when cPrefAsyncGatherRes ($7) is non zero
	//	- otherwise issues the DMA get (cmd 64, tag 31) of cEAAligned into g_scPreWriteArea
	//	MFC channels are written in the order LSA(16), EAL(18), TagID(20), Cmd(21); channel 19 is not written here
	lqa	$25,_ZN4NSPU6NCache17g_pSPUShadowCacheE		//1/6		preload g_pSPUShadowCache for asm cache miss handler	
	ceq	$12, $66, $21			//0/2		$12 = all ones if (cEAAligned == g_CurAtomicEA), 0 otherwise
	brz	$11,.PrefetchHit	//1/4		taken when $11 == 0, i.e. NOT (spu_extract(cPrefetchCntlzRes, 0) == 32)
	clz $43,	$7						//0/2		const uint32 cIndex	= spu_extract(spu_cntlz(cPrefAsyncGatherRes), 0)
	rotqbyi	$39, $23, 4		//1/4		spu_rlqwbyte(g_PrefetchLRUDir, 4) for GetReplIndex
#if defined(_DEBUG)
	ila	$2,_ZN4NSPU7NDriver11g_sLSBufferE		//0/2		load g_scPreWriteArea
#else	
	ila	$2,261504					//0/2		load g_scPreWriteArea
#endif//_DEBUG
	brnz $12, .SyncAtomicWriteBackPref	//1/4	if(cEAAligned == g_CurAtomicEA) branch ($12 != 0 means equal, see ceq above)
	il	$50,1							//0/2		gen 1 (shifted into a tag mask by .SyncAsyncTransferPref)
	brnz $7, .SyncAsyncTransferPref	//1/4		IF(cIndex != 32, false) branch	(gb result used here)
.SyncAsyncTransferPrefBack:
	il	$43,31						//0/2		gen transfer tag (overwrites cIndex, which is no longer needed here)
	wrch $ch16,$2					//1/6		si_wrch(MFC_LSA,si_from_ptr(g_scPreWriteArea))
	il	$44,64						//0/2		gen MFC_GET_CMD
	wrch $ch18,$21				//1/6		si_wrch(MFC_EAL,si_from_uint(cEAAligned))
	il	$46,6							//0/2		gen offset of transDecrEnd, only used for bubbles
	wrch $ch20,$43				//1/6		si_wrch(MFC_TagID,si_from_uint(31))
	il $37, 3							//0/2		preload 3 for GetReplIndex(g_PrefetchLRUDir)	
	wrch	$ch21,$44				//1/6		si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD))
#if !defined(_DEBUG)
	// Bubble state poll loop (release builds only), executed while the miss DMA is in flight.
	// Walks the 4 entries (16 bytes each, 64 bytes total) of g_SPUBubbleStates:
	//		curState			is the halfword at byte offset 2 of an entry
	//		transDecrEnd	is the word at byte offset 8 of an entry
	// For every entry with curState == 1 whose transDecrEnd is not below the current decrementer value ($55),
	// .BubbleSync waits for the tag group given by curTagMask and sets curState to 2.
	// Register roles inside the loop:
	//		$42 = &g_SPUBubbleStates[i].curState		$45 = end address		$44 = curTagMask (256 << i)
	//		$49 = g_SPUBubbleStates[i] (whole entry, reloaded after each ++i)
	//		$60 = address of the entry currently processed (for the store in .BubbleSync)
	// transDecrEnd is extracted from $49, the entry of the CURRENT iteration. The former code rotated
	// a separately loaded $52 which was only refreshed after its use, so iterations 1..3 compared against
	// the transDecrEnd of the previous entry. The two loads of $52 became dead and are replaced by lnop
	// to keep the pipeline 0 / pipeline 1 pairing of the surrounding instructions unchanged.
	ila	$42,_ZN4NSPU17g_SPUBubbleStatesE+2	//0/2		load &g_SPUBubbleStates[0].curState
	rdch $55,$ch8					//1/20	spu_readch(SPU_RdDec)
	il	$44,256						//0/2		int curTagMask = 256
	lnop									//1/0		slot filler (entry is loaded into $49 below)
	ai	$45,$42,64				//0/2		&g_SPUBubbleStates[scMaxSPUBubbleCount]
	lqd	$49,0($42)				//1/6		load g_SPUBubbleStates[i] for access of curState and transDecrEnd
.BubbleLoopStart:	
	ai	$2,$42,14					//0/2		gen rotation amount of curState
	hbra .BubbleLoopEnd,.BubbleLoopStart	//branch hint for cont.of while (backward likelyness)
	ori	$51,$44,0					//0/2		save curTagMask
	rotqby	$2,$49,$2			//1/4		rotate g_SPUBubbleStates[i].curState into pref. slot
	il $37, 3							//0/2		preload 3 for GetReplIndex(g_PrefetchLRUDir)	
	rotqbyi	$50,$49,8			//1/4		rotate g_SPUBubbleStates[i].transDecrEnd into pref. slot (same entry as curState)
	il	$47,2							//0/2		gen MFC_TAG_UPDATE_ALL / BUB_STATE_READY
	chd	$53,0($42)				//1/4		gen mask for inserting g_SPUBubbleStates[i].curState
	NOP										//0/0
	lnop									//1/0		slot filler
	ceqhi	$2,$2,1					//0/2		(g_SPUBubbleStates[i].curState == 1)
	lnop									//1/0			
	clgt	$50,$55,$50			//0/2		(current decrementer value > g_SPUBubbleStates[i].transDecrEnd), unsigned
	rotqbyi $60, $42, 0		//1/4		save current &g_SPUBubbleStates[i]
	nor	$2,$2,$2					//0/2		!(g_SPUBubbleStates[i].curState == 1)
	lnop									//1/0			
	ai		$42,$42,16			//0/2		++i
	lnop									//1/0
	or	$2,$2,$50					//0/2		zero only if curState == 1 and decrementer <= transDecrEnd
	lnop									//1/0
	a			$44,$44,$44			//0/2		curTagMask <<= 1
	lnop									//1/0
	ceq	$61,$42,$45				//0/2		++i < scMaxSPUBubbleCount (replaced by address cmp. with last address)
	brhz $2,.BubbleSync		//1/4		if(cBubCheck)
	NOP										//0/0
	lqd	$49,0($42)				//1/6		load g_SPUBubbleStates[i] for access of curState	
	NOP										//0/0
.BubbleLoopEnd:
	brz	$61,.BubbleLoopStart//1/4	while(++i < scMaxSPUBubbleCount)
	NOP										//0/0
	br	.PrefetchMissSetup//1/4	continue with post prefetch miss code
.BubbleSync:
	// here $49 still holds the entry addressed by $60, $42 already points to the next entry
	NOP										//0/0
	shufb	$2,$47,$49,$53	//1/4		g_SPUBubbleStates[i].curState = 2
	NOP										//0/0
	wrch	$ch22,$51				//1/6		spu_writech(MFC_WrTagMask, curTagMask)
	NOP										//0/0
	wrch	$ch23,$47				//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL)
	ceq		$61,$42,$45			//0/2		++i < scMaxSPUBubbleCount (replaced by address cmp. with last address)
	lqd		$49,0($42)			//1/6		load g_SPUBubbleStates[i] for access of curState	
	NOP										//0/0						
	stqd	$2,0($60)				//1/6		store g_SPUBubbleStates[i]
	NOP										//0/0						
	rdch	$2,$ch24				//1/6		spu_readch(MFC_RdTagStat)
	NOP										//0/0					
	brz $61,.BubbleLoopStart//1/4		branch to loop condition test
#endif	//_DEBUG

//precalculate some stuff for FlushSingleCacheLine in both pathes (pref.hit/miss)
.PrefetchMissSetup:
	//common code for prefetch misses
	//	expects:	$39 = spu_rlqwbyte(g_PrefetchLRUDir, 4) (computed at handler entry)
	//						$37 = 3 (preloaded by the code branching/falling through to here)
	//	produces:	$15 = *pReplIndex = index (0..3) of the smallest word of g_PrefetchLRUDir ($23)
	//						$16 = rReplMask (0xFFFFFFFF in word slot *pReplIndex, 0 elsewhere)
	//						$14 = *pCacheSetmask = (1 << 31), $13 = pRetValue = g_scPreWriteArea
	//						$10 = (rNextEA4 + cPrefOff) & ~scSPUCacheLineSizeMask
	//						$34/$35/$33/$18/$32/$40/$36/$30/$24/$79 as listed at .PrefetchPostSetup
	//	GetReplIndex is a two level minimum search: min(cVal0,cVal1) and min(cVal2,cVal3) first, then the
	//	smaller of the two results. The first level select must therefore choose between cVal0 ($23) and
	//	cVal1 ($39); selecting cVal2 ($48) there yields a wrong replacement index whenever cVal0 > cVal1.
	shli	$35,$4,4				//0/4		cSet * sizeof(vec_uint4)
	rotqbyi	$48, $23, 8		//1/4		const vec_uint4 cVal2	= spu_rlqwbyte(g_PrefetchLRUDir, 8)
	shli	$34,$4,2				//0/4		cSet << scSPUCacheSetNumWaysShift
	rotqbyi	$41, $23, 12	//1/4		const vec_uint4 cVal3	= spu_rlqwbyte(g_PrefetchLRUDir, 12)
	clgt $42,$23,$39			//0/2		const vec_uint4 cCmpVec01	= spu_cmpgt(cVal0, cVal1)
	hbra .ContToPrefetchPostSetup, .PrefetchPostSetup	//1/10	branch hint for cont. to common post pref. code
	il $38, 2							//0/2		preload 2 for GetReplIndex(g_PrefetchLRUDir)
	lqa $36, _ZN4NSPU6NCache18g_AsyncRangesDirToE		//1/6	 load g_AsyncRangesDirTo
	selb $44,$23,$39,$42	//0/2		const vec_uint4 cCmpSelRes01 = spu_sel(cVal0, cVal1, cCmpVec01)
	fsmbi $79, 0 					//1/4		g_CurWrittenEA = (vec_uint4)0
	clgt $43,$48,$41			//0/2		const vec_uint4 cCmpVec23	= spu_cmpgt(cVal2, cVal3), 1 STALL
	lnop									//1/0	
	andi $42,$42,1				//0/2		const vec_uint4 cCmpIndexRes01 = spu_sel((vec_uint4)0, (vec_uint4)1, cCmpVec01)//spu_and(cCmpVec01, 1)
	lqx	$33,$35,$76				//1/6		g_pSPUCacheDir[cSet]
	selb $45,$48,$41,$43	//0/2		const vec_uint4 cCmpSelRes23 = spu_sel(cVal2, cVal3, cCmpVec23)
	lqx	$18,$35,$73				//1/6		g_pSPUCacheLRUCtrl[cSet]
	selb $43,$38, $37,$43 //0/2		const vec_uint4 cCmpIndexRes23 = spu_sel((vec_uint4)2, (vec_uint4)3, cCmpVec23)
//	lnop									//1/0					
	clgt $46, $44, $45		//0/2		const vec_uint4 cCmpVec0123	= spu_cmpgt(cCmpSelRes01, cCmpSelRes23)
//	lnop									//1/0						
	a	$34,$6,$34					//0/2		(cSet << scSPUCacheSetNumWaysShift) + cIndexInSet)
//	lnop									//1/0						
	selb $15,$42,$43,$46	//0/2		*pReplIndex = GetReplIndex(g_PrefetchLRUDir) (= spu_sel(cCmpIndexRes01, cCmpIndexRes23, cCmpVec0123))
//	lnop									//1/0						
	il $42, -1						//0/2		gen 0xFFFFFFFF
	lqa	$24,_ZN4NSPU6NCache13g_PrefetchDirE				//1/6		preload g_PrefetchDir for asm cache miss handler
	shli $48,$15,2				//0/4		sizeof(uint32) * *pReplIndex for insertion mask
	lqa	$30,_ZN4NSPU6NCache16g_CurSPUAsyncTagE			//1/6		load g_CurSPUAsyncTag
	shli $34,$34,7				//0/4		sizeof(vec_uint4) * ((cSet << scSPUCacheSetNumWaysShift) + cIndexInSet) << (scSPUCacheLineSizeShift-4)
//	lnop									//1/0						
	ilhu $14,-32768				//0/2	  *pCacheSetmask = (1 << 31)	
//	lnop									//1/0						
	il $41, 0							//0/2		gen 0 for spu_insert
	lnop									//1/0		
#if defined(_DEBUG)
	ila	$13,_ZN4NSPU7NDriver11g_sLSBufferE		//0/2		pRetValue = (vec_uint4*)g_scPreWriteArea
#else	
	ila	$13,261504				//0/2		pRetValue = (vec_uint4*)g_scPreWriteArea
#endif	
	cwx	$48,$sp,$48				//1/4		gen insertion mask for spu_insert
	a	$10,$9,$10					//0/2		rNextEA4 = spu_add(rNextEA4, cPrefOff)
	lqx	$40,$34,$77				//1/6		const vec_uint4 cCurLine0 = cpCurLine[0]
	a	$32,$34,$77					//0/2		cpCurLine	= &g_pSPUCache[cCacheEntry]
//	lnop									//1/0						
//	NOP										//0/0
	lnop									//1/0						
	andi $10,$10,-128			//0/2		rNextEA4 = spu_and(rNextEA4, ~scSPUCacheLineSizeMask)
	shufb	$16,$42,$41,$48	//1/4		rReplMask = spu_insert(0xFFFFFFFF, (const vec_uint4)0, *pReplIndex)
	NOP										//0/0
.ContToPrefetchPostSetup:
	br .PrefetchPostSetup	//1/4		continue with common prefetch code
	
.PrefetchHit:
	//we have a prefetch hit
	//	the requested line is taken from g_pPrefetchBuffer instead of being fetched into g_scPreWriteArea
	//	produces:	$15 = *pReplIndex = cIndex = cPrefetchCntlzRes - 28
	//						$13 = pRetValue = &g_pPrefetchBuffer[cIndex << scSPUCacheLineSizeShift]
	//						$14 = *pCacheSetmask = (1 << (31 - cIndex))
	//						$16 = rReplMask (0xFFFFFFFF in word slot cIndex, 0 elsewhere)
	//						$10 = (rNextEA4 + cPrefOff) & ~scSPUCacheLineSizeMask
	//						$34/$35/$33/$18/$32/$40/$36/$30/$24/$79 as listed at .PrefetchPostSetup
	//	falls through into .PrefetchPostSetup
	ai $15,$5,-28					//0/2		*pReplIndex = const uint32 cIndex = spu_extract(spu_sub(cPrefetchCntlzRes, (vec_uint4)28), 0)
	lqa $36, _ZN4NSPU6NCache18g_AsyncRangesDirToE		//1/6	 load g_AsyncRangesDirTo
	shli	$34,$4,2				//0/4		cSet << scSPUCacheSetNumWaysShift
	fsmbi $79, 0 					//1/4		g_CurWrittenEA = (vec_uint4)0
	shli $43,$15,2				//0/4		sizeof(uint32) * *pReplIndex for insertion mask
	fsmbi $45, 0					//1/4		gen 0 for spu_insert	
	sfi	$41,$15,31				//0/2		(31 - cIndex)
	lnop									//1/0		
	shli $47,$15,7				//0/4		cIndex << scSPUCacheLineSizeShift
	lqa	$30,_ZN4NSPU6NCache16g_CurSPUAsyncTagE			//1/6		load g_CurSPUAsyncTag
	a	$34,$6,$34					//0/2		(cSet << scSPUCacheSetNumWaysShift) + cIndexInSet)
	fsmbi	$46,65535				//1/4		gen 0xFFFFFFFF for spu_insert
	ila	$42,_ZN4NSPU6NCache17g_pPrefetchBufferE	//0/2		&g_pPrefetchBuffer[0]
	cwx	$44,$sp,$43				//1/4		gen insertion mask for spu_insert
	shli $34,$34,7				//0/4		sizeof(vec_uint4) * ((cSet << scSPUCacheSetNumWaysShift) + cIndexInSet) << (scSPUCacheLineSizeShift-4)
	lqa	$24,_ZN4NSPU6NCache13g_PrefetchDirE				//1/6		preload g_PrefetchDir for asm cache miss handler
	shli	$35,$4,4				//0/4		cSet * sizeof(vec_uint4)
//	lnop									//1/0		
	a $13,$47, $42				//0/2		pRetValue	= (vec_uint4*)&g_pPrefetchBuffer[cIndex << scSPUCacheLineSizeShift]		
//	lnop									//1/0		
	a	$10,$9,$10					//0/2		rNextEA4 = spu_add(rNextEA4, cPrefOff)
	lnop									//1/0		
	il $48, 1							//0/2		gen 1 for mask shifting
	shufb	$16,$46,$45,$44	//1/4		rReplMask = spu_insert(0xFFFFFFFF, (const vec_uint4)0, cIndex)
	andi $10,$10,-128			//0/2		rNextEA4 = spu_and(rNextEA4, ~scSPUCacheLineSizeMask)	
	lqx	$18,$35,$73				//1/6		g_pSPUCacheLRUCtrl[cSet]
	a	$32,$34,$77					//0/2		cpCurLine	= &g_pSPUCache[cCacheEntry]
	lqx	$40,$34,$77				//1/6		const vec_uint4 cCurLine0 = cpCurLine[0]
	shl	$14,$48,$41				//0/4		*pCacheSetmask = (1 << (31 - cIndex))
	lqx	$33,$35,$76				//1/6		g_pSPUCacheDir[cSet]
.PrefetchPostSetup:
	//prepared: $35 = cSet << 4, $34 = cCacheEntry, $33 = g_pSPUCacheDir[cSet], $32 = cpCurLine, $40 = cCurLine0
	//					$30 = g_CurSPUAsyncTag ($31 is written below with cpShadowLine)
	//common code for prefetches, here also starts FlushSingleCacheLine
	//	- loads the 8 quadwords of the cache line to be replaced ($40..$47) and of its shadow copy ($50..$57)
	//	- xors them into the write back mask, stored to the 8 quadwords at 261760..261872
	//	- $48 ends up non zero only if any mask bit is set and the line's EA ($64, splatted in $63) is not 0
	//	- $60 ends up non zero if any word of (g_AsyncRangesDirTo > cEA) && !(g_AsyncRangesDirFrom > cEA) is set
	//	- also prepares g_PrefetchDirReg ($68), g_PrefetchLRUDirReg ($69) and the cReplIndex insertion mask ($26)
	//	note: $26 (g_AsyncRangesDirFrom on entry) is overwritten after its last use in the cgt below
	a	$31,$34,$25					//0/2		const vec_uint4* const __restrict cpShadowLine = &g_pSPUShadowCache[cCacheEntry]
	lqx	$50,$34,$25				//1/6		const vec_uint4 cShadowLine0 = cpShadowLine[0]
	ai $75,$75,-1					//0/2		revert early increment to g_LRUCounter due to LS lookup handling
	lqd	$41,16($32)				//1/6		const vec_uint4 cCurLine1 = cpCurLine[1]
	selb $68,$24,$10,$16	//0/2		g_PrefetchDirReg = spu_sel(g_PrefetchDir, rNextEA4, rReplMask)
	lqd	$51,16($31)				//1/6		const vec_uint4 cShadowLine1 = cpShadowLine[1]
	selb $69,$23,$75,$16	//0/2		g_PrefetchLRUDirReg = spu_sel(g_PrefetchLRUDir, g_LRUCounter, rReplMask)
	lqd	$42,32($32)				//1/6		const vec_uint4 cCurLine2 = cpCurLine[2]	
	NOP										//0/0
	lqd	$52,32($31)				//1/6		const vec_uint4 cShadowLine2 = cpShadowLine[2]
	shli $65,$6,2					//0/4		cIndexInSet * sizeof(uint32)
	lqd	$43,48($32)				//1/6		const vec_uint4 cCurLine3 = cpCurLine[3]	
	xor	$50,$40,$50				//0/2		spu_xor(cCurLine0, cShadowLine0)
	lqd	$53,48($31)				//1/6		const vec_uint4 cShadowLine3 = cpShadowLine[3]
	ila	$29,_ZN4NSPU6NCache19g_SPUAsyncCacheLineE	//0/2	&g_SPUAsyncCacheLine[0]
	lqd	$44,64($32)				//1/6		const vec_uint4 cCurLine4 = cpCurLine[4]
	xor	$51,$41,$51				//0/2		spu_xor(cCurLine1, cShadowLine1)
	lqd	$54,64($31)				//1/6		const vec_uint4 cShadowLine4 = cpShadowLine[4]
	shli $28,$30,7				//0/4		(g_CurSPUAsyncTag << (scSPUCacheLineSizeShift - 4)) * sizeof(vec_uint4)
	lqd	$45,80($32)				//1/6		const vec_uint4 cCurLine5 = cpCurLine[5]
	xor	$52,$42,$52				//0/2		spu_xor(cCurLine2, cShadowLine2)
	lqd	$55,80($31)				//1/6		const vec_uint4 cShadowLine5 = cpShadowLine[5]
	or $48,$50,$51				//0/2		vec_uint4 diffVec = spu_or(pWriteBackMask[0], pWriteBackMask[1])
	lqd	$46,96($32)				//1/6		const vec_uint4 cCurLine6 = cpCurLine[6]
	xor	$53,$43,$53				//0/2		spu_xor(cCurLine3, cShadowLine3)
	lqd	$56,96($31)				//1/6		const vec_uint4 cShadowLine6 = cpShadowLine[6]
	or $48,$48,$52				//0/2		diffVec = spu_or(diffVec, pWriteBackMask[2])
	lqd	$47,112($32)			//1/6		const vec_uint4 cCurLine7 = cpCurLine[7]
	xor	$54,$44,$54				//0/2		spu_xor(cCurLine4, cShadowLine4)
	lqd	$57,112($31)			//1/6		const vec_uint4 cShadowLine7 = cpShadowLine[7]
	or $48,$48,$53				//0/2		diffVec = spu_or(diffVec, pWriteBackMask[3])
	rotqby $64,$33,$65		//1/4		const uint32 cEA = spu_extract(g_pSPUCacheDir[cSet], cIndexInSet)
	xor	$55,$45,$55				//0/2		spu_xor(cCurLine5, cShadowLine5)
	hbra .CacheLineIsUnchanged, .PostFlushSingleCacheLine	//1/10  prefer case that cache line is unchanged
	or $48,$48,$54				//0/2		diffVec = spu_or(diffVec, pWriteBackMask[4])
	stqa $51,261776				//1/6		store pWriteBackMask[1]
	xor	$56,$46,$56				//0/2		spu_xor(cCurLine6, cShadowLine6)
	stqa $52,261792				//1/6		store pWriteBackMask[2]
	or $48,$48,$55				//0/2		diffVec = spu_or(diffVec, pWriteBackMask[5])
	shufb	$63,$64,$64,$8	//1/4		const vec_uint4 cEASplat4 = spu_splats(cEA)
	xor	$57,$47,$57				//0/2		spu_xor(cCurLine7, cShadowLine7)
	stqa $50,261760				//1/6		store pWriteBackMask[0]
	or $48,$48,$56				//0/2		diffVec = spu_or(diffVec, pWriteBackMask[6])
	stqa $53,261808				//1/6		store pWriteBackMask[3]
	a	$27,$28,$29					//0/2		vec_uint4* const __restrict pCurAsyncCacheLine = &g_SPUAsyncCacheLine[g_CurSPUAsyncTag << (scSPUCacheLineSizeShift-4)]
	stqa $56,261856				//1/6		store pWriteBackMask[6]
	or $48,$48,$57				//0/2		diffVec = spu_or(diffVec, pWriteBackMask[7])
	stqa $57,261872				//1/6		store pWriteBackMask[7]
	cgt $61, $26, $63			//0/2		const vec_uint4 cFromCmpRes = spu_cmpgt(g_AsyncRangesDirFrom, cEASplat)
	stqa $55,261840				//1/6		store pWriteBackMask[5]
	cgt $60, $36, $63			//0/2		const vec_uint4 cToCmpRes = spu_cmpgt(g_AsyncRangesDirTo, cEASplat)
	orx $48, $48					//1/4		spu_orx(diffVec)
	ori $26,$65,0					//0/2		save cIndexInSet * sizeof(uint32)
	rotqbyi	$17,$64,-4		//1/4		put cEA into slot expected by mfc_getllar
	ceq	$62,$63,$79				//0/2		spu_cmpeq(cEA4, (vec_uint4)0) ($79 was zeroed by the hit/miss setup code)
	stqa $54,261824				//1/6		store pWriteBackMask[4]
	andc $60,$60,$61			//0/2		const vec_uint4 cFinalCmpRes = spu_andc(cToCmpRes, cFromCmpRes)
	lqa $12, _ZN4NSPU6NCache13g_SPUAsyncDirE		//1/6		const vec_uint4 cAsyncDirSaved = g_SPUAsyncDir
	andc $48,$48,$62			//0/2		spu_orx(diffVec) != 0 && cEA4 != 0
	cwx	$26, $sp, $26			//1/4		generate insertion mask for spu_insert(..., , cReplIndex)
	xswd $17, $17					//0/2		sign extend 64 bit cEA as expected by mfc_getllar
	orx $60, $60					//1/4		spu_orx(cFinalCmpRes)
	il $70,0							//0/2		$70 = 0 (g_AtomicEAToStartReg = 0)
.CacheLineIsUnchanged:	
	brz $48, .PostFlushSingleCacheLine	//1/4  if(cIsCacheLineUnChanged) return
	ori $79,$63,0					//0/2		g_CurWrittenEA = cEASplat4
//	lnop									//1/0		
	shli $58,$30,2				//0/4		cCurTag * sizeof(uint32)
//	lnop									//1/0
	ori $38, $64, 0				//0/2		save cEA currently written back (overwritten in StartAtomicWriteBack)
	brnz $60, .FlushCacheLineASync		//1/4		if(cTransferAsync) goto asynchronous transfer, otherwise fall through into .FlushCacheLineSync
.FlushCacheLineSync:
	// synchronous (atomic) write back path of FlushSingleCacheLine
	//	1. if g_CurAtomicEA ($66) is non zero, read the atomic status (channel 27) of the transfer in flight;
	//		 a non zero status goes to .FlushCacheLineSyncCheckSyncAgain, which comes back to .FlushCacheLineSyncCheck
	//	2. start mfc_getllar (cmd 208) of cEA ($38) into g_scWriteBackArea ($39 = 261888)
	//	3. copy the write back mask (261760..) to g_scWriteBackMaskAtomic (261632..)
	//	4. copy the current cache line ($32) to g_scWriteBackSavedArea (262016..)
	//	5. set g_CurAtomicEA = cEA and g_AtomicEAToStartReg ($70) = 1
	ila	$39,261888				//0/2			gen the LS address for the atomic transfer (g_scWriteBackArea)
	brz $66,.PostSyncAtomicWriteBackFlushCacheLine		//1/4		IF(g_CurAtomicEA != 0, true)
.FlushCacheLineSyncCheck:
	ila	$42, 261888				//0/2		load g_scWriteBackArea (consumed by .FlushCacheLineSyncCheckSyncAgain)
	rdch	$2,$ch27				//1/6		int status = mfc_read_atomic_status()
	NOP										//0/0		
	brnz	$2, .FlushCacheLineSyncCheckSyncAgain		//1/4		IF(status != 0, false) sync again
.PostSyncAtomicWriteBackFlushCacheLine:
	il $65,208						//0/2		gen start command for mfc_getllar
	wrch $ch16,$39				//1/6		put the LS address (g_scWriteBackArea) into its channel
//	NOP										//0/0		
	wrch $ch18,$38				//1/6		put cEA into its channel
//	NOP										//0/0		
	wrch $ch21,$65				//1/6		start mfc_getllar
	//implements CopyCacheLine((vec_uint4*)g_scWriteBackMaskAtomic, (vec_uint4*)g_scWriteBackMask)
	hbra .FlushSingleCacheLineSyncRet, .PostFlushSingleCacheLine	//1/10	branch hint for return from synchronous transfer			
	lqa $40, 261760				//1/6		
	lqa $41, 261776				//1/6		
	lqa $42, 261792				//1/6		
	lqa $43, 261808				//1/6		
	lqa $44, 261824				//1/6		
	lqa $45, 261840				//1/6		
	lqa $46, 261856				//1/6		
	lqa $47, 261872				//1/6		
	stqa $40,261632				//1/6		
	stqa $41,261648				//1/6		
	stqa $42,261664				//1/6		
	stqa $43,261680				//1/6		
	stqa $44,261696				//1/6		
	stqa $45,261712				//1/6		
	stqa $46,261728				//1/6		
	stqa $47,261744				//1/6		
	//implements CopyCacheLine(g_scWriteBackSavedArea, &g_pSPUCache[cCacheEntry])	
	lqd $40,0($32)				//1/6		
	lqd $41,16($32)				//1/6		
	lqd $42,32($32)				//1/6		
	lqd $43,48($32)				//1/6		
	lqd $44,64($32)				//1/6		
	lqd $45,80($32)				//1/6		
	lqd $46,96($32)				//1/6		
	lqd $47,112($32)			//1/6		
	stqa $40,262016				//1/6		
	stqa $41,262032				//1/6		
	stqa $42,262048				//1/6		
	stqa $43,262064				//1/6		
	stqa $44,262080				//1/6		
	stqa $45,262096				//1/6		
	stqa $46,262112				//1/6		
	ori $66, $38, 0				//0/2		g_CurAtomicEA	= cEA	
	stqa $47,262128				//1/6		
	il $70,1							//0/2		g_AtomicEAToStartReg = (vec_uint4)1	
.FlushSingleCacheLineSyncRet:
	br .PostFlushSingleCacheLine	//1/4		return from FlushSingleCacheLine
	
.FlushCacheLineASync:
	// asynchronous write back path of FlushSingleCacheLine
	//	- copies the cache line still held in $40..$47 (cCurLine0..7) to pCurAsyncCacheLine ($27)
	//	- issues a DMA put (cmd 32) of that copy to cEA ($64) with tag cCurTag ($30)
	//	- advances g_CurSPUAsyncTag to (cCurTag + 1) & 3 and records cEA in g_SPUAsyncDir[cCurTag]
	//	- $12 keeps the g_SPUAsyncDir value from before the update (cAsyncDirSaved)
	//	falls through into .PostFlushSingleCacheLine
	ai	$48,$30,1					//0/2		(g_CurSPUAsyncTag + 1)
	stqd $40, 0($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
	il	$50,32						//0/2		gen MFC_PUT_CMD for DMA transfer
	stqd $41, 16($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
	andi $48,$48,3				//0/2		g_CurSPUAsyncTag = (g_CurSPUAsyncTag + 1) & 3
	stqd $42, 32($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
//	NOP										//0/0		
	stqd $43, 48($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
//	NOP										//0/0		
	stqd $44, 64($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
//	NOP										//0/0		
	stqd $45, 80($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
//	NOP										//0/0		
	stqd $46, 96($27)			//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
//	NOP										//0/0		
	cwx	$52,$sp,$58				//1/4		generate controls for insertion ($58 = cCurTag * sizeof(uint32))
//	NOP										//0/0		
	stqd $47, 112($27)		//1/6		CopyCacheLine(pCurAsyncCacheLine, &g_pSPUCache[cCacheEntry])
//	NOP										//0/0		
	wrch	$ch16,$27				//1/6		si_wrch(MFC_LSA,si_from_ptr(pCurAsyncCacheLine))
//	NOP										//0/0		
	stqa $48,_ZN4NSPU6NCache16g_CurSPUAsyncTagE			//1/6		store new g_CurSPUAsyncTag
//	NOP										//0/0			
	shufb	$58,$64,$12,$52	//1/4		g_SPUAsyncDir	= spu_insert(cEA, g_SPUAsyncDir, cCurTag)
//	NOP										//0/0		
	wrch	$ch20,$30				//1/6		si_wrch(MFC_TagID,si_from_uint(cCurTag))
//	NOP										//0/0			
	wrch	$ch18,$64				//1/6		si_wrch(MFC_EAL,si_from_uint(cEA))
//	NOP										//0/0		
	wrch	$ch21,$50				//1/6		si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD))
	NOP										//0/0		
	stqa $58, _ZN4NSPU6NCache13g_SPUAsyncDirE		//1/6		store updated g_SPUAsyncDir

.PostFlushSingleCacheLine:
	//prepared: $33 = g_pSPUCacheDir[cSet],  $35 = cSet << 4,  $26 = insertion mask for spu_insert(..., , cReplIndex)
	//					$18 = g_pSPUCacheLRUCtrl[cSet], $34 = cCacheEntry = (cLine4 << 4), $32 = pCacheLine
	//					$31 = &g_pSPUShadowCache[cLine4]
	//final part of the miss handler:
	//	- updates and stores g_pSPUCacheDir[cSet] and g_pSPUCacheLRUCtrl[cSet]
	//	- waits (tag mask $14, MFC_TAG_UPDATE_ALL) for the transfer that delivers the requested line to pRetValue ($13)
	//	- clears the prefetch entry again if nextEA4 matches a word of cAsyncDirSaved ($12)
	//	- starts the next prefetch (cmd 66) of nextEA4 ($10) into g_pPrefetchBuffer with tag 31 - cReplIndex
	//	- copies the received line from pRetValue into both the cache line ($32) and the shadow line ($31)
	//	- stores g_CurAtomicEA, g_PrefetchDir, g_PrefetchLRUDir, increments g_LRUCounter ($75)
	//	- returns $3 = (&g_sSPUCacheMem + cEAOff) + lineStartOff via $lr
	ceq	$40,$10,$12				//0/2		const vec_uint4 cAsyncCmpRes = spu_cmpeq(nextEA4, cAsyncDirSaved)
	shufb	$33,$21,$33,$26	//1/4		g_pSPUCacheDir[cSet] = spu_insert(cEAAligned, g_pSPUCacheDir[cSet], cReplIndex)
	il $41,2							//0/2		gen MFC_TAG_UPDATE_ALL
	shufb	$18,$72,$18,$26	//1/4		g_pSPUCacheLRUCtrl[cSet] = spu_insert(spu_extract(g_LRUCounterIncr, 0), g_pSPUCacheLRUCtrl[cSet], cReplIndex)
	shli $65,$15,7				//0/4		replIndex << scSPUCacheLineSizeShift
	orx $40, $40					//1/4		const vec_uint4 cAnyHit	= spu_orx(cAsyncCmpRes)
	ori $3,$34,0					//0/2		gen return value (cLine4 << 4)
	wrch	$ch22,$14				//1/6		spu_writech(MFC_WrTagMask, tagMask)
	ila	$64,_ZN4NSPU6NCache17g_pPrefetchBufferE	//0/2		&g_pPrefetchBuffer[0]
	stqx $33,$35,$76			//1/6		store g_pSPUCacheDir[cSet]
	a	$22,$22,$77					//0/2		&g_sSPUCacheMem + cEAOff
	stqx $18,$35,$73			//1/6		store g_pSPUCacheLRUCtrl[cSet]
	a $64, $65, $64				//0/2		&g_pPrefetchBuffer[replIndex << scSPUCacheLineSizeShift]
	shufb	$20,$40,$40,$8	//1/4		spu_splats(spu_extract(cAnyHit, 0))
	sfi	$15,$15,31				//0/2		31 - cReplIndex	
	wrch $ch23,$41				//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL)
	il $19, 0							//0/2		load 0
	stqa $66,_ZN4NSPU6NCache13g_CurAtomicEAE		//1/6		store g_CurAtomicEA
	NOP										//0/0
	rdch $2,$ch24					//1/6		spu_readch(MFC_RdTagStat)
	//now quickly load the contents transferred here to start the next prefetch
	NOP										//0/0
	lqd $40,0($13)				//1/6
	ceqi $20,$20, 0				//0/2		spu_splats(spu_extract(cAnyHit, 0)) == 0 (channel 0 duplicated)
	lqd $41,16($13)				//1/6
	NOP										//0/0
	lqd $42,32($13)				//1/6
	selb $20,$16,$19,$20	//0/2		select the mask so that it becomes replMask if cAnyHit != 0 and 0 otherwise
	lqd $43,48($13)				//1/6
	NOP										//0/0
	lqd $44,64($13)				//1/6
	andc $68,$68,$20			//0/2		g_PrefetchDirReg = spu_sel(g_PrefetchDir, (vec_uint4)0, replMask)
	lqd $45,80($13)				//1/6
	andc $69,$69,$20			//0/2		g_PrefetchLRUDirReg = spu_sel(g_PrefetchLRUDir, (vec_uint4)0, replMask)
	lqd $46,96($13)				//1/6
	NOP										//0/0
	hbr .Return, $lr			//1/10  branch hint for return from DoCacheLookup
	il	$2,128						//0/2		gen cache line size (only needed by the disabled MFC_Size write below)
	wrch $ch16,$64				//1/6		si_wrch(MFC_LSA,si_from_ptr(&g_pPrefetchBuffer[cReplIndex << scSPUCacheLineSizeShift]))
	il $48, 66						//0/2		gen MFC_GETF_CMD
	wrch $ch18,$10				//1/6		si_wrch(MFC_EAL,si_from_uint(spu_extract(cNextEA4, 0)))
//	NOP										//0/0
//	wrch $ch19,$2					//1/6		si_wrch(MFC_Size,si_from_uint(128))
//	NOP										//0/0
	wrch $ch20,$15				//1/6		si_wrch(MFC_TagID,si_from_uint(31 - cReplIndex))
//	NOP										//0/0
	wrch $ch21,$48				//1/6		si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD))
	//now store loaded contents into the cache and shadow cache line	
	andc $24, $24, $24		//0/2		$24 = 0
	lqd $47,112($13)			//1/6
	ceq	$23,$79,$68				//0/2		const vec_uint4 cPrefCmpRes	= spu_cmpeq(g_CurWrittenEA, g_PrefetchDir);
	stqd $40,0($32)				//1/6
	ila	$50, 261632				//0/2		load g_scWriteBackMaskAtomic = ((vec_uint4*)(256 * 1024 - 4*scSPUCacheLineSize))
	stqd $40,0($31)				//1/6
	selb $25,$68,$24,$23	//0/2		g_PrefetchDir	= spu_sel(g_PrefetchDir, (vec_uint4)0, cPrefCmpRes);
	stqd $41,16($32)			//1/6
	a	$3,$22,$3						//0/2		const unsigned int cLineOff		= &g_sSPUCacheMem + cEAOff + lineStartOff;	
	stqd $41,16($31)			//1/6
	NOP										//0/0
	stqd $42,32($32)			//1/6
	selb $26,$69,$24,$23	//0/2		g_PrefetchLRUDir	= spu_sel(g_PrefetchLRUDir, (vec_uint4)0, cPrefCmpRes);
	stqd $42,32($31)			//1/6
//	NOP										//0/0
	stqd $43,48($32)			//1/6
//	NOP										//0/0
	stqd $43,48($31)			//1/6
//	NOP										//0/0
	stqd $44,64($32)			//1/6
//	NOP										//0/0
	stqd $44,64($31)			//1/6
//	NOP										//0/0
	stqd $45,80($32)			//1/6
//	NOP										//0/0
	stqd $45,80($31)			//1/6
//	NOP										//0/0
	stqd $46,96($32)			//1/6
//	NOP										//0/0
	stqd $46,96($31)			//1/6
//	NOP										//0/0
	stqd $47,112($32)			//1/6
//	NOP										//0/0
	stqd $47,112($31)			//1/6
	NOP										//0/0
	brnz $70, .MissHandlerAtomicWriteBack		//1/4		IF(g_AtomicEAToStart, false) StartAtomicWriteBack(); (label is not defined in this section of the file)
	NOP										//0/0	
	stqa	$25, _ZN4NSPU6NCache13g_PrefetchDirE		//1/6		store updated NCache::g_PrefetchDir
	ai	$75,$75,1					//0/2		++NCache::g_LRUCounter; (all word slots)
	stqa	$26, _ZN4NSPU6NCache16g_PrefetchLRUDirE	//1/6		store updated NCache::g_PrefetchLRUDir
#if !defined(_NO_SPU_ASSERT)
	//implement: if(GetStackAddress() <= NSPU::g_sProgramTopLS + STACK_WARNING_VAL) StackAssertFunc();
	//an 80 byte frame is opened; $80 is kept at old $sp - 16 (= 64 in the new frame), $81 at old $sp - 48 (= 32 in the new frame)
	//$80 carries the return value and $81 the link register across the call
	ori $17, $80, 0				//0/2			save $80 as required by ABI
	stqd	$81,-48($sp)		//1/6			save register 81 as required by ABI
	NOP										//0/0	
	stqd	$80,-16($sp)		//1/6			save register 80 as required by ABI
	NOP										//0/0	
	stqd	$sp,-80($sp)		//1/6			store stack pointer for StackAssertFunc
	ai	$sp,$sp,-80				//0/2			decrement stack for StackAssertFunc
	lqa $4, _ZN4NSPU15g_sProgramTopLSE	//1/6		load NSPU::g_sProgramTopLS
	ori $80, $3, 0				//0/2			save return value
	lnop									//1/0
	ori $81, $lr, 0				//0/2			save link register
	stqd	$17,64($sp)			//1/6			save register 80 (saved into $17) as required by ABI (same slot as the -16 store above)
	ai $4, $4, STACK_WARNING_VAL		//0/2		NSPU::g_sProgramTopLS + STACK_WARNING_VAL
	lnop									//1/0
	cgt $4, $4, $sp				//0/2			NSPU::g_sProgramTopLS + STACK_WARNING_VAL > GetStackAddress() (after the frame was opened)
	brz $4, .NoStackFailed2//1/4			skip StackAssertFunc() unless the comparison above is true
	NOP										//0/0	
	brsl	$lr, _Z15StackAssertFuncv	//1/4  //StackAssertFunc()
.NoStackFailed2:		
	ori	$3, $80, 0				//0/2			restore return value
	lqd	$80,64($sp)				//1/6			restore register 80 as required by ABI (stored with -16-> -16+80 = 64)	
	ori	$lr, $81, 0				//0/2			restore link register
	lqd	$81,32($sp)				//1/6			restore register 81 as required by ABI (stored with -48 -> -48+80 = 32)
	ai	$sp,$sp,80				//0/2			restore stack pointer
#else	
	NOP										//0/0				
#endif //_NO_SPU_ASSERT
.Return:	
	bi	$lr								//1/4			return pRet;
//END OF SPUCacheMissHandler

.FlushCacheLineSyncCheckSyncAgain:
	//should even more rarely happen that the atomic transfer back fails due to interference by the PPU or another SPU
	//retry: re-issue mfc_getllar (cmd 208) of g_CurAtomicEA ($66) into g_scWriteBackArea ($42), then run
	//.StartAtomicWriteBack with $65 = .FlushCacheLineSyncCheck as return address, so the status is checked again
//	il $46,128						//0/2		next instr.set up mfc_getllar(g_scWriteBackArea, g_CurAtomicEA, 0, 0)
//	rotqbyi	$43,$66,-4		//1/4		
//	NOP										//0/0
//	hbra .FlushCacheLineSyncCheckSyncAgainStartAtomicWriteBack, .StartAtomicWriteBack			//branch hint for call to StartAtomicWriteBack
	il $45,208							//0/2		gen start command for mfc_getllar
	wrch $ch16,$42					//1/6		LS address: g_scWriteBackArea
//	xswd $43,$43					//0/2		
//	wrch $ch17,$43				//1/6
//	NOP										//0/0
	wrch $ch18,$66				//1/6		EA: g_CurAtomicEA
//	wrch $ch19,$46				//1/6
//	NOP										//0/0
//	wrch $ch20,$44				//1/6
//	NOP										//0/0
	wrch $ch21,$45				//1/6		start mfc_getllar
	ila $65, .FlushCacheLineSyncCheck	//0/2		return address for .StartAtomicWriteBack
.FlushCacheLineSyncCheckSyncAgainStartAtomicWriteBack:
	br	 .StartAtomicWriteBack		//1/4		call StartAtomicWriteBack (target directly follows this branch)

//imple. of StartAtomicWriteBack, uses all working registers
//returns with bi $65, so the caller must provide the return address in $65
//(the callers in this file do so either with brsl $65 or with an explicit ila $65 followed by br)
//contains no nops to save instruction memory
//for each of the 8 quadwords of the line: pWriteBackArea[i] = spu_sel(pWriteBackArea[i], cpWriteBackSavedArea[i], cpWriteBackMask[i])
//then the merged line at 261888 is written back to g_CurAtomicEA ($66) with MFC command 180
//reads channel 27 (atomic status) once before merging; the value read is not evaluated here
.StartAtomicWriteBack:
	lqa	$50,261632				//1/6			const vec_uint4 cWriteBackMasks0 = cpWriteBackMask[0]
	lqa	$51,261648				//1/6			const vec_uint4 cWriteBackMasks1 = cpWriteBackMask[1]
	lqa	$52,261664				//1/6			const vec_uint4 cWriteBackMasks2 = cpWriteBackMask[2]
	lqa	$53,261680				//1/6			const vec_uint4 cWriteBackMasks3 = cpWriteBackMask[3]
	lqa	$54,261696				//1/6			const vec_uint4 cWriteBackMasks4 = cpWriteBackMask[4]
	lqa	$55,261712				//1/6			const vec_uint4 cWriteBackMasks5 = cpWriteBackMask[5]
	lqa	$56,261728				//1/6			const vec_uint4 cWriteBackMasks6 = cpWriteBackMask[6]
	lqa	$57,261744				//1/6			const vec_uint4 cWriteBackMasks7 = cpWriteBackMask[7]
	hbr .RetStartAtomicWriteBack, $65		//1/10		branch hint for return from StartAtomicWriteBack	
	lqa	$58,262016				//1/6			const vec_uint4 cWriteBackSavedAreas0 = cpWriteBackSavedArea[0]
	lqa	$59,262032				//1/6			const vec_uint4 cWriteBackSavedAreas1 = cpWriteBackSavedArea[1]
	lqa	$60,262048				//1/6			const vec_uint4 cWriteBackSavedAreas2 = cpWriteBackSavedArea[2]
	lqa	$61,262064				//1/6			const vec_uint4 cWriteBackSavedAreas3 = cpWriteBackSavedArea[3]
	lqa	$62,262080				//1/6			const vec_uint4 cWriteBackSavedAreas4 = cpWriteBackSavedArea[4]
	lqa	$63,262096				//1/6			const vec_uint4 cWriteBackSavedAreas5 = cpWriteBackSavedArea[5]
	lqa	$64,262112				//1/6			const vec_uint4 cWriteBackSavedAreas6 = cpWriteBackSavedArea[6]
	lqa	$40,262128				//1/6			const vec_uint4 cWriteBackSavedAreas7 = cpWriteBackSavedArea[7]
	rdch $2,$ch27					//1/6			mfc_read_atomic_status()
	lqa	$42,261888				//1/6			const vec_uint4 cWriteBackAreas0 = pWriteBackArea[0]
	lqa	$43,261904				//1/6			const vec_uint4 cWriteBackAreas1 = pWriteBackArea[1]
	lqa	$44,261920				//1/6			const vec_uint4 cWriteBackAreas2 = pWriteBackArea[2]
	lqa	$45,261936				//1/6			const vec_uint4 cWriteBackAreas3 = pWriteBackArea[3]		
	lqa	$46,261952				//1/6			const vec_uint4 cWriteBackAreas4 = pWriteBackArea[4]
	lqa	$47,261968				//1/6			const vec_uint4 cWriteBackAreas5 = pWriteBackArea[5]
	selb $42,$42,$58,$50	//0/2			const vec_uint4 cSelResults0 = spu_sel(cWriteBackAreas0, cWriteBackSavedAreas0, cWriteBackMasks0)
	lqa	$48,261984				//1/6			const vec_uint4 cWriteBackAreas6 = pWriteBackArea[6]
	selb $43,$43,$59,$51	//0/2			const vec_uint4 cSelResults1 = spu_sel(cWriteBackAreas1, cWriteBackSavedAreas1, cWriteBackMasks1)	
	lqa	$49,262000				//1/6			const vec_uint4 cWriteBackAreas7 = pWriteBackArea[7]
	selb $44,$44,$60,$52	//0/2			const vec_uint4 cSelResults2 = spu_sel(cWriteBackAreas2, cWriteBackSavedAreas2, cWriteBackMasks2)	
	stqa $42, 261888			//1/6			store cSelResults0
	selb $45,$45,$61,$53	//0/2			const vec_uint4 cSelResults3 = spu_sel(cWriteBackAreas3, cWriteBackSavedAreas3, cWriteBackMasks3)	
	stqa $43, 261904			//1/6			store cSelResults1
	selb $46,$46,$62,$54	//0/2			const vec_uint4 cSelResults4 = spu_sel(cWriteBackAreas4, cWriteBackSavedAreas4, cWriteBackMasks4)	
	stqa $44, 261920			//1/6			store cSelResults2
	selb $47,$47,$63,$55	//0/2			const vec_uint4 cSelResults5 = spu_sel(cWriteBackAreas5, cWriteBackSavedAreas5, cWriteBackMasks5)	
	stqa $45, 261936			//1/6			store cSelResults3
	selb $48,$48,$64,$56	//0/2			const vec_uint4 cSelResults6 = spu_sel(cWriteBackAreas6, cWriteBackSavedAreas6, cWriteBackMasks6)	
	stqa $46, 261952			//1/6			store cSelResults4
	ila	$2,261888					//0/2			gen the LS address for the atomic transfer
	stqa $47, 261968			//1/6			store cSelResults5
	il $51, 0							//0/2			gen the upper EA address for the atomic transfer (0), only used by the disabled channel writes below
	stqa $48, 261984			//1/6			store cSelResults6
	selb $49,$49,$40,$57	//0/2			const vec_uint4 cSelResults7 = spu_sel(cWriteBackAreas7, cWriteBackSavedAreas7, cWriteBackMasks7)		
	wrch	$ch16,$2				//1/4			put the LS address into its channel
//	il $52, 128						//0/2			gen the MFC constant for channel 19
//	wrch	$ch17,$51				//1/4			put the upper EA into its channel (0)
//	wrch	$ch20,$51				//1/4			put the MFC command its channel
	il $53, 180						//0/2			gen the MFC constant for channel 21	
	wrch	$ch18,$66				//1/4			set g_CurAtomicEA as ea for the atomic transfer
	NOP										//0/0	
//	wrch	$ch19,$52				//1/4			put the MFC command its channel
//	NOP										//0/0	
	stqa $49, 262000			//1/6			store cSelResults7 (last store of the merged line, must precede the command write below)
//	NOP										//0/0	
	wrch	$ch21,$53				//1/4			start the atomic transfer
//	NOP										//0/0	
.RetStartAtomicWriteBack:
	bi $65								//1/4			branch back

.SyncAsyncTransferPref:
	// Reached from the handler entry when cPrefAsyncGatherRes ($7) is non zero.
	// Waits for the asynchronous transfer using tag (cIndex - 28) before the miss DMA is issued.
	//	expects:	$43 = cIndex (count of leading zeros of cPrefAsyncGatherRes), $50 = 1
	//	clobbers:	$7, $50, $53
	//	returns to .SyncAsyncTransferPrefBack
	// The tag group query mask (channel 22) is written BEFORE the tag status update request (channel 23),
	// as at the other two wait sites of this file: the update request is evaluated against the mask
	// that is current when it is issued, so requesting first would wait on the previously used mask.
	ai	$7,$43,-28					//0/2		cIndex - 28
	hbra .SyncAsyncTransferPrefRet, .SyncAsyncTransferPrefBack	//1/10	branch hint for jump back
	il	$53,2							//0/2		gen MFC_TAG_UPDATE_ALL
//	lnop									//1/0	
	shl	$50,$50,$7				//0/4		const uint32 cCurTagMask = (1<<(cIndex - 28))
//	lnop									//1/0			
	NOP										//0/0
//	lnop									//1/0		
	NOP										//0/0
//	lnop									//1/0			
	NOP										//0/0
	wrch $ch22,$50				//1/6		spu_writech(MFC_WrTagMask, cCurTagMask)
	NOP										//0/0
	wrch $ch23,$53				//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL)
	NOP										//0/0	
	rdch	$50,$ch24				//1/6		spu_readch(MFC_RdTagStat)
	lnop									//1/0		give the hint enough time to fetch the instr.
	lnop									//1/0		same
	NOP										//0/0		
.SyncAsyncTransferPrefRet:	
	br 	.SyncAsyncTransferPrefBack	//1/4 branch back

.SyncAtomicWriteBackPrefSyncAgain:
	//should even more rarely happen that the atomic transfer back fails due to interference by the PPU or another SPU
	//retry: re-issue mfc_getllar (cmd 208) of g_CurAtomicEA ($66) into g_scWriteBackArea ($42), then call
	//.StartAtomicWriteBack; the brsl below is the last instruction of this section, so the routine returns
	//to .SyncAtomicWriteBackPref directly behind it, where the atomic status is read again
//	il			$46,128				//0/2		next instr.set up mfc_getllar(g_scWriteBackArea, g_CurAtomicEA, 0, 0)
//	hbra	.SyncAgainCallStartAtomicWriteBack, .StartAtomicWriteBack			//branch hint for call to StartAtomicWriteBack
//	NOP										//0/0
//	rotqbyi	$43,$66,-4		//1/4		
	il			$45,208				//0/2		gen start command for mfc_getllar
	wrch		$ch16,$42			//1/6		LS address: g_scWriteBackArea
//	xswd		$43,$43				//0/2		
//	wrch		$ch17,$43			//1/6
//	NOP										//0/0
	wrch		$ch18,$66			//1/6		EA: g_CurAtomicEA
//	NOP										//0/0
//	wrch		$ch19,$46			//1/6
//	NOP										//0/0
//	wrch		$ch20,$44			//1/6
//	NOP										//0/0
	wrch		$ch21,$45			//1/6		start mfc_getllar
	NOP										//0/0	
.SyncAgainCallStartAtomicWriteBack:
	brsl	 $65, .StartAtomicWriteBack		//1/4		call StartAtomicWriteBack, $65 = address of .SyncAtomicWriteBackPref

.SyncAtomicWriteBackPref:
	//we have hit the current atomic line written back, it is very unlikely to happen
	//	- reads the atomic status (channel 27); a non zero status goes to .SyncAtomicWriteBackPrefSyncAgain
	//	- otherwise copies the 128 byte line from g_scWriteBackArea (261888) to g_scPreWriteArea
	//		(261504 in release builds, g_sLSBuffer in _DEBUG builds) instead of fetching it by DMA
	//	- sets g_CurAtomicEA ($66) to 0 and preloads $38 = 2, $37 = 3 for GetReplIndex
	//	- continues at .PrefetchMissSetup
	NOP										//0/0
	rdch	$2,$ch27				//1/6		int status = mfc_read_atomic_status()
	ila	$42, 261888				//0/2		load g_scWriteBackArea (consumed by .SyncAtomicWriteBackPrefSyncAgain)
	brnz	$2, .SyncAtomicWriteBackPrefSyncAgain		//1/4		IF(status != 0, false) sync again
	//next lines implement CopyCacheLine(g_scPreWriteArea, g_scWriteBackArea)
#if defined(_DEBUG)
	ila	$2,_ZN4NSPU7NDriver11g_sLSBufferE		//0/2		load g_scPreWriteArea ($2 was overwritten by the status read above)
#endif
	lqa	$50, 261888				//1/6		
#if defined(_DEBUG)
	NOP										//0/0
#endif	
	lqa	$51, 261904				//1/6		
	il $66, 0							//0/2		g_CurAtomicEA = 0	
	hbra .SyncAtomicWriteBackPrefBack,.PrefetchMissSetup	//1/10
	lqa	$52, 261920				//1/6		
	lqa	$53, 261936				//1/6		
	lqa	$54, 261952				//1/6		
	lqa	$55, 261968				//1/6		
	lqa	$56, 261984				//1/6		
	lqa	$57, 262000				//1/6		
#if defined(_DEBUG)
	stqd $50,0($2)				//1/6
	stqd $51,16($2)				//1/6
	stqd $52,32($2)				//1/6
	stqd $53,48($2)				//1/6			
	stqd $54,64($2)				//1/6
	stqd $55,80($2)				//1/6
	il $38, 2							//0/2		preload 2 for GetReplIndex(g_PrefetchLRUDir)	
	stqd $56,96($2)				//1/6
	il $37, 3							//0/2		preload 3 for GetReplIndex(g_PrefetchLRUDir)		
	stqd $57,112($2)			//1/6			
#else	
	stqa $50,261504				//1/6
	stqa $51,261520				//1/6
	stqa $52,261536				//1/6
	stqa $53,261552				//1/6			
	stqa $54,261568				//1/6
	stqa $55,261584				//1/6
	il $38, 2							//0/2		preload 2 for GetReplIndex(g_PrefetchLRUDir)	
	stqa $56,261600				//1/6
	il $37, 3							//0/2		preload 3 for GetReplIndex(g_PrefetchLRUDir)		
	stqa $57,261616				//1/6
#endif//_DEBUG	
	NOP										//0/0
.SyncAtomicWriteBackPrefBack:
	br	.PrefetchMissSetup//1/4		continue with post prefetch miss code	

#endif //PS3
