#if defined(PS3)

#define NOP nop $127
#define STOPD stop 254

//miss handler for code paging
//behind each instruction: pipeline (0/1) / latency(cycles)

//Code paging cross call miss handler
//	must be implemented in asm to keep the parameter registers unchanged (use high registers only)
//	use a bra to keep the link register the same as well (do not forget the branch hint for the bra)
//	$71 will contain the 2x8 bytes of the cross call entry, compare branch offset to link register-4 to obtain the
//		correct destination bubble id and offset
//	pushes the return stack and updates LRU for both bubbles as well
//
//	algorithmic idea (not executed in order necessarily):
//		1. get current source bubble instruction addr. from link register
//		2. get the current source bubble base address from current instruction address
//		3. load bubble directory
//		4. compare instruction address to $71 (g_CrossBubbleData) to obtain the desired destination entry
//		5. use this (4.) to get the destination bubble ID
//		6. get the current slot of the destination bubble
//		7. check state of the bubble, if still in e_BubStateStreaming state: sync its dma transfer
//		8. update its (7.) state to e_BubStateReady
//		9. obtain the current bubble base address from its slot
//		10.compose absolute destination address of dest.bubble and the respective function call
//		11.load current bubble LRU directory and counter
//		12.update LRU for calling and destination bubble slot 
//		13.perform a bi to this address (link register stays untouched)

//	to support SPU Debugging, before exiting the miss handlers, a stopd (snPause) is executed 
//		in case debugging is enabled for the current job


/*	C++ version:
		void CodePagingCallMissHandler()
		{
			const vec_int4 cBubbID4							= spu_splats((int32)spu_extract(g_CrossBubbleData, 1) & NBubBin::SCrossPatch::scMaxBubbleID);
			const vec_uint4 cCurBubMask					= spu_cmpeq(cBubbID4, g_SPUBubbleDir);//mask for source.bubble
			const int32 cDestBubbleID						= (int32)spu_extract(g_CrossBubbleData, 4);
			const bool cIsWeak									= (cDestBubbleID & NBubBin::SCrossPatch::scMaxBubbleID) != cDestBubbleID;
			const uint32 cDestBubbleOff					= (int32)spu_extract(g_CrossBubbleData, 3);
			//use id to get slot/mask of corresponding bubble slot
			const vec_int4 cDestBubbleID4				= spu_splats(cDestBubbleID);
			const vec_uint4 cDestBubMask				= spu_cmpeq(cDestBubbleID4, g_SPUBubbleDir);//mask for dest.bubble
			//update LRU
			g_BubbleLRUCounter									= spu_add(g_BubbleLRUCounter, 1);
			g_SPUBubbleLRUDir	 									= spu_sel(g_SPUBubbleLRUDir, g_BubbleLRUCounter, cCurBubMask);
			g_SPUBubbleLRUDir	 									= spu_sel(g_SPUBubbleLRUDir, g_BubbleLRUCounter, cDestBubMask);			
			//get index of the bubble to access g_SPUBubbleStates
			const uint32 cDestBubSlot						= spu_extract(spu_orx(spu_and(cDestBubMask, g_SPUBubbleIndexMask0)), 0);
			//compose full address for the branch destination
			const uint32 cAbsBranchTargetAddr		= cDestBubbleOff + spu_extract(spu_orx(spu_and(cDestBubMask, g_BubbleMemLower)), 0);
			//sync dma transfer and change state
			SBubbleState& rState = g_SPUBubbleStates[cDestBubSlot];
#if defined(DO_SPU_PROFILING)
		++NSPU::NDriver::g_PerfStats.callMissHandlerCalls;
#endif
			if(rState.curState == BUB_STATE_STREAMING)
			{
				rState.curState				= BUB_STATE_READY;
				const uint32 cTagMask	= (1<<(g_scDMABubbleTag0 + cDestBubSlot));
				spu_writech(MFC_WrTagMask, cTagMask);//sync all at once
				spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
				spu_readch(MFC_RdTagStat);
#if defined(DO_SPU_PROFILING)
		++NSPU::NDriver::g_PerfStats.syncsInCallMissHandler;
#endif			
			}
			g_CurBubIndex = (vec_uint4)cDestBubbleID;
			//update return stack
			vec_ushort8 crossBubDataMod = g_CrossBubbleData;
			const uint16 cCurBubSlot = spu_extract(spu_add(spu_cntlz(spu_gather(cCurBubMask)), -28), 0);			
			crossBubDataMod = spu_insert(cCurBubSlot, crossBubDataMod, 2);
			g_pReturnStackTop += cIsWeak?0:1;			//do not increment return stack if it is a weak cross call
			*((vec_ushort8*)g_pReturnStackTop) = crossBubDataMod;
		}
*/
	.file	"MissHandler_spu.S"
#if defined(CHECK_BUB_HAZARD)	
.BS0:
	.string	"CodePagingCallMissHandler: stack pointer overflow"	
#endif	
.text
	.align 7
	.global	CodePagingCallMissHandler
	.type	CodePagingCallMissHandler, @function
//-----------------------------------------------------------------------------------------------------
// CodePagingCallMissHandler
//	Purpose:	resolves a cross bubble call: computes the absolute target address inside the destination
//						bubble, touches the LRU entries of source and destination bubble, writes a return stack
//						entry and finally branches (bi) to the target.
//	In:				$71 = g_CrossBubbleData (cross call entry); halfword elements 1, 3 and 4 are read
//						$sp is only read, to build insertion masks (chd/cwd); the fixed offsets used assume
//						a 16 byte aligned $sp
//	Out:			control is transferred with bi $52; $lr is never written by this handler
//	Globals written:	g_BubbleLRUCounter, g_SPUBubbleLRUDir, g_CurBubIndex, g_pReturnStackTop,
//						the return stack slot at (old g_pReturnStackTop + 16),
//						g_SPUBubbleStates[destSlot] (sync path only), g_PerfStats (DO_SPU_PROFILING only)
//	Scratch:	only registers in the range $40..$66 are written in the plain build.
//						Optional builds additionally write $2, $35, $36 (DO_SPU_PROFILING), $34 (SUPP_SN)
//						and $37..$39 (CHECK_BUB_HAZARD; $3 is overwritten there but restored right after).
//	Trailing comments give pipeline (0/1) / latency (cycles), instructions are interleaved for dual issue.
//
//	FIX:	with DO_SPU_PROFILING the .SyncDMA path incremented the counters in $36 but never stored
//				the quadword back (the only store was on the fall through path), so both
//				callMissHandlerCalls and syncsInCallMissHandler were lost whenever a sync happened.
//				The store is now issued on the .SyncDMA path as well.
//-----------------------------------------------------------------------------------------------------
CodePagingCallMissHandler:
	ila $44, 32767					//0/2		load 15 bit mask for bubble IDs
	rotqbyi	$60,$71,6				//1/4		bring halfword element 4 of g_CrossBubbleData into the low half of the preferred word
	ila $62, 66051					//0/2		0x00010203: shuffle pattern that splats the preferred word into all 4 slots
	lqa	$61, _ZN4NSPU14g_SPUBubbleDirE	//1/6	load g_SPUBubbleDir
	and $43, $71, $44				//0/2		source bubble ID = spu_extract(g_CrossBubbleData, 1) masked to 15 bit
#if defined(DO_SPU_PROFILING)
	lqa	$36, _ZN4NSPU7NDriver11g_PerfStatsE+80//1/6	load ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
	NOP											//0/0		
#endif	
	rotqbyi	$53,$71,4				//1/4		bring halfword element 3 (cDestBubbleOff) into the low half of the preferred word
#if defined(CHECK_BUB_HAZARD)
	ila $39, _ZN4NSPU13g_ReturnStackE	//0/2		load &g_ReturnStack[0]
	lnop										//1/0
	ila $38, _ZN4NSPU11NCodePaging12BubbleHazardEPKc	//0/2		load &BubbleHazard
	lnop										//1/0	
#endif		
	ila $41, 65535					//0/2		load mask for shorts
	lqa	$51,_ZN4NSPU16g_BubbleMemLowerE	//1/6		load g_BubbleMemLower	
	and $65,$60,$44					//0/2		cDestBubbleID masked to 15 bit
	shufb	$58,$43,$43,$62		//1/4		const vec_int4 cBubbID4	= spu_splats(source bubble ID)
	il $40, 2								//0/2		gen 2, used as BUB_STATE_READY and as MFC_TAG_UPDATE_ALL
	lqa	$57,_ZN4NSPU18g_BubbleLRUCounterE	//1/6		load g_BubbleLRUCounter
	ila	$50, _ZN4NSPU17g_SPUBubbleStatesE	//0/2	load &NSPU::g_SPUBubbleStates[0]
	shufb	$59,$65,$65,$62		//1/4		const vec_int4 cDestBubbleID4	= spu_splats(cDestBubbleID)
	and $60,$60,$41					//0/2		full 16 bit value of element 4, compared against the 15 bit value to detect a weak call
	lqa	$55,_ZN4NSPU24g_SPUBubbleIndexMaskShl4E	//1/6		load g_SPUBubbleIndexMaskShl4	
	ceq	$58, $58, $61				//0/2		const vec_uint4 cCurBubMask	= spu_cmpeq(cBubbID4, g_SPUBubbleDir), mask for source bubble
	lqa	$43,_ZN4NSPU21g_SPUBubbleIndexMask0E	//1/6		load g_SPUBubbleIndexMask0
	and $53,$53,$41					//0/2		cDestBubbleOff masked to 16 bit
	lqa	$56,_ZN4NSPU17g_SPUBubbleLRUDirE	//1/6		load g_SPUBubbleLRUDir
#if defined(CHECK_BUB_HAZARD)		
	ai $39, $39, 480				//0/2		&g_ReturnStack[0] + (RETURN_STACK_MAX_ENTRIES-1) * sizeof(SReturnStackEntry)
	lnop										//1/0
#endif	
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$35, 6($sp)					//1/4		create insertion mask for callMissHandlerCalls (halfword at byte 6)
	NOP											//0/0
	rotqbyi	$2, $36, 4			//1/4		rotate callMissHandlerCalls into the low half of the preferred word
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.callMissHandlerCalls
	shufb	$36, $2, $36, $35	//1/4		reinsert callMissHandlerCalls into its place within g_PerfStats
#endif
	ceq	$59, $59, $61				//0/2		const vec_uint4 cDestBubMask = spu_cmpeq(cDestBubbleID4, g_SPUBubbleDir), mask for dest. bubble
	gb	$42, $58						//1/4		spu_gather(cCurBubMask)
	ai	$57,$57,1						//0/2		g_BubbleLRUCounter = spu_add(g_BubbleLRUCounter, 1)
	lqa	$63, _ZN4NSPU17g_pReturnStackTopE		//1/6  load g_pReturnStackTop
	and	$52, $59,	$51				//0/2		spu_and(cDestBubMask, g_BubbleMemLower)
	chd	$62,2($sp)					//1/4		generate controls for insertion of (rState.curState = BUB_STATE_READY), $62 is reused here
	and	$49, $59,	$55				//0/2		spu_and(cDestBubMask, g_SPUBubbleIndexMaskShl4) (generates offset into SBubbleState)
	chd	$64,4($sp)					//1/4		generate controls for insertion of SReturnStackEntry::bubbleSlot
	and	$54, $59,	$43				//0/2		spu_and(cDestBubMask, g_SPUBubbleIndexMask0)
	orx	$52,$52							//1/4		base address of dest. bubble = spu_extract(spu_orx(spu_and(cDestBubMask, g_BubbleMemLower)), 0)
	clz	$42, $42						//0/2		spu_cntlz(spu_gather(cCurBubMask))
	orx	$49,$49							//1/4		const uint32 cDestBubSlotShl4	= spu_extract(spu_orx(spu_and(cDestBubMask, g_SPUBubbleIndexMaskShl4)), 0)
	selb $56,$56,$57,$58		//0/2		g_SPUBubbleLRUDir	= spu_sel(g_SPUBubbleLRUDir, g_BubbleLRUCounter, cCurBubMask)
	orx	$54,$54							//1/4		const uint32 cDestBubSlot	= spu_extract(spu_orx(spu_and(cDestBubMask, g_SPUBubbleIndexMask0)), 0)
	ai	$42,$42,-28					//0/2		cCurBubSlot = spu_add(spu_cntlz(spu_gather(cCurBubMask)), -28)
	stqa	$57,_ZN4NSPU18g_BubbleLRUCounterE	//1/6		store g_BubbleLRUCounter		
	a	$52, $52, $53					//0/2		const uint32 cAbsBranchTargetAddr	= cDestBubbleOff + base address of dest. bubble
#if defined(SUPP_SN)
	lqa $34, _ZN4NSPU7NDriver13g_sDebugStateE	//1/6  load g_sDebugState
#else	
	lnop										//1/0
#endif	
	ai	$41, $63, 16				//0/2		g_pReturnStackTop + 1 entry
	lqx	$48,$49,$50					//1/6		SBubbleState& rState = g_SPUBubbleStates[cDestBubSlot]
	ai $47, $54, 8					//0/2		g_scDMABubbleTag0 + cDestBubSlot
	hbr .Ret, $52						//1/10	branch hint for the bi at .Ret
	selb $56,$56,$57,$59		//0/2		g_SPUBubbleLRUDir	= spu_sel(g_SPUBubbleLRUDir, g_BubbleLRUCounter, cDestBubMask)
	shufb	$42,$42,$71,$64		//1/4		crossBubDataMod = spu_insert(cCurBubSlot, g_CrossBubbleData, 2)
#if defined(CHECK_BUB_HAZARD)		
	cgt $39, $41, $39				//0/2		((uint32)g_pReturnStackTop > (uint32)&g_ReturnStack[RETURN_STACK_MAX_ENTRIES-1])
	rotqbyi	$37, $3, 0			//1/4		save $3
	ila $3, .BS0						//0/2		put string into parameter register for BubbleHazard()
	binz $39, $38						//1/4		if((uint32)g_pReturnStackTop > (uint32)&g_ReturnStack[RETURN_STACK_MAX_ENTRIES-1]) BubbleHazard()
	ori $3, $37, 0					//0/2		restore $3
	lnop										//1/0
	il	$46, 1							//0/2		gen 1	
#else	
	il	$46, 1							//0/2		gen 1
#endif	
	stqa $65, _ZN4NSPU13g_CurBubIndexE	//1/6		g_CurBubIndex = cDestBubbleID (15 bit masked)
	ceq $66,$65,$60					//0/2		all ones if masked ID equals the 16 bit value, i.e. the call is not weak
	lnop										//1/0	
	ceqhi $45,$48,2					//0/2		rState.curState == BUB_STATE_READY
	shufb	$59,$40,$48,$62		//1/4		rState.curState = BUB_STATE_READY (no store yet, only stored on the sync path)
	selb $63,$63,$41,$66		//0/2		new g_pReturnStackTop: incremented only if it is a non weak cross call
	stqa	$56,_ZN4NSPU17g_SPUBubbleLRUDirE	//1/6		store g_SPUBubbleLRUDir
	shl	$51,$46,$47					//0/4		const uint32 cTagMask	= (1<<(g_scDMABubbleTag0 + cDestBubSlot))
	brhz $45,.SyncDMA				//1/4		if(rState.curState != BUB_STATE_READY) branch, expect it not
	NOP											//0/0	
	//NOTE(review): the entry is always written to the incremented address ($41), also for weak calls,
	//	whereas the C++ reference writes to the (then not incremented) g_pReturnStackTop - confirm which is intended
	stqd $42, 0($41)				//1/6		store crossBubDataMod into the return stack
	NOP											//0/0		nops pad the slots required for the branch hint
#if defined(DO_SPU_PROFILING)
	stqa	$36, _ZN4NSPU7NDriver11g_PerfStatsE+80//1/6	save ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
	NOP											//0/0
#endif
	stqa	$63, _ZN4NSPU17g_pReturnStackTopE		//1/6  store new g_pReturnStackTop
	NOP											//0/0	
#if defined(SUPP_SN)
	brnz	$34, .SPU_DEBUG_BREAK	//1/4		if(IsDebugEnabled()) snPause()
	NOP											//0/0		
#endif	
.Ret:
	bi	$52									//1/4		branch to the calculated absolute address in the desired bubble
.SyncDMA:
	//destination bubble is not yet marked ready: wait for its DMA tag and mark it ready
	//do not try to add a branch hint here, will hang up SPU
	wrch $ch22,$51					//1/6		spu_writech(MFC_WrTagMask, cTagMask)
	wrch $ch23,$40					//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
	stqd $42, 0($41)				//1/6		store crossBubDataMod into the return stack (same address as on the fast path)
	stqx $59,$49,$50				//1/6		store updated rState
	NOP											//0/0
	stqa	$63, _ZN4NSPU17g_pReturnStackTopE		//1/6  store new g_pReturnStackTop	
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$35, 10($sp)				//1/4		create insertion mask for syncsInCallMissHandler (halfword at byte 10)
	NOP											//0/0
	rotqbyi	$2, $36, 8			//1/4		rotate syncsInCallMissHandler into the low half of the preferred word
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.syncsInCallMissHandler
	shufb	$36, $2, $36, $35	//1/4		reinsert syncsInCallMissHandler into its place within g_PerfStats
	NOP											//0/0
	stqa	$36, _ZN4NSPU7NDriver11g_PerfStatsE+80//1/6	FIX: write both updated counters back, this path skips the store above
#endif	
	rdch $57,$ch24					//1/6		spu_readch(MFC_RdTagStat), blocks until the tag group has completed
#if defined(SUPP_SN)
	brnz	$34, .SPU_DEBUG_BREAK	//1/4		if(IsDebugEnabled()) snPause()
	NOP											//0/0		
#endif	
.RetDMA:
	bi	$52									//1/4		branch to the calculated absolute address in the desired bubble	
#if defined(SUPP_SN)	
.SPU_DEBUG_BREAK:
	//debugger support: publish the branch target in the quadword at LS address 16, then stop
	lqa	$54,16							//1/6		load contents of upper 16 bytes of SPUDriver image spu_mod_hdr
	cwd	$55,12($sp)					//1/4		generate control word for insertion into spu_mod_hdr->pad
	shufb	$54,$52,$54,$55		//1/4		insert branch target
	stqa	$54,16						//1/6		store updated spu_mod_hdr
	NOP											//0/0										
	lqa	$54,16							//1/6		make sure it has completed storage before stop
	STOPD										//1/4		snPause()
	bi	$52									//1/4		branch to the calculated absolute address in the desired bubble	
#endif
	
	.size	CodePagingCallMissHandler, .-CodePagingCallMissHandler





//Code paging cross bubble return miss handler
//	must be implemented in asm to keep the $lr unchanged and for speed reasons
//	each entry function is permanently patched to the return miss handler
//		also inner calls to those entry functions will go through it
//	it extracts the current bubble ID from the $lr to determine whether it is an inner bubble return call
//		if not, then it pops the return stack to obtain the desired bubble ID and slot, 
//			special check needs to get added for the actual return into the execute function of the job
//	if the return bubble is in the same slot as required, the branch goes straight using the $lr
//	if bubble is not present in its expected slot, stream and replace the bubble accordingly
//	the return bubble can be in the same bubble slot as currently in 
//		since the return miss handler code is outside the replaced bubble
//	perform a branch indirect to the destination given by $lr
//
//	to support SPU Debugging, before exiting the miss handlers, a stopd (snPause) is executed 
//		in case debugging is enabled for the current job


/* C++ version (not working, pseudo template)
	void CodePagingReturnMissHandler()
	{
		vec_uint4 curLR;
		asm volatile("ori %0, $lr, 0" : "=r"(curLR) : : );//copy link register
		const vec_uint4 cLR4							= spu_splats(spu_extract(curLR, 0));
		//load top entry of return stack (preload, will not use it if it is the job return or an inner bubble return
		const vec_ushort8 cTopEntry				= *(vec_ushort8*)g_pReturnStackTop;
		const uint16 cBubbleID						= spu_extract(cTopEntry, 1);
		const uint32 cRequBubSlot					= (uint32)spu_extract(cTopEntry, 2);
		//get index of destination bubble slot
		const vec_int4 cDestBubbleID4			= spu_splats((int)cBubbleID);
		const vec_uint4 cDestBubMask			= spu_cmpeq(cDestBubbleID4, g_SPUBubbleDir);//mask for dest.bubble
		const uint32 cDestBubSlot					= spu_extract(spu_add(spu_cntlz(spu_gather(cDestBubMask)), -28), 0);
		const vec_uint4 cLowerAddrCmpRes	= spu_cmpgt(cLR4, g_BubbleMemLower);
		const vec_uint4 cUpperAddrCmpRes	= spu_cmpgt(g_BubbleMemUpper, cLR4);
		const vec_uint4 cCurBubMask				= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes);
		const uint32 cCurBubbleID					= spu_extract(spu_orx(spu_and(cCurBubMask, g_SPUBubbleDir)), 0);
		const vec_uint4 cCurBubbleCmp			= spu_cmpeq(g_CurBubIndex, (vec_uint4)cCurBubbleID);//only lowest slot matter
#if defined(DO_SPU_PROFILING)
		++NSPU::NDriver::g_PerfStats.returnMissHandlerCalls;
#endif
		const bool cSimpleFastRet					= 
			(cBubbleID == SReturnStackEntry::cIsJobBubble) ||	//return into job
			(cDestBubSlot == cRequBubSlot) ||									//bubble is present at its slot
			(spu_extract(cCurBubbleCmp, 0) != 0);							//inner bubble return
		g_pReturnStackTop									= (SReturnStackEntry*)(void*)spu_extract(spu_sel((vec_int4)((int32)g_pReturnStackTop -16), (vec_int4)(int32)g_pReturnStackTop, cCurBubbleCmp), 0);
		g_BubbleLRUCounter								= spu_add(g_BubbleLRUCounter, 1);//IncrBubbleLRU()
		g_CurBubIndex											= spu_sel((vec_uint4)spu_extract(cDestBubbleID4, 0),g_CurBubIndex,cCurBubbleCmp);
		IF(cSimpleFastRet, true)
		{
			//update LRU
			g_SPUBubbleLRUDir = spu_sel(g_SPUBubbleLRUDir, g_BubbleLRUCounter, cDestBubMask);
			asm volatile("bi %0" : "=r"(curLR) : : );
		}
		//replace bubble at desired slot
		SBubbleState& rState	= g_SPUBubbleStates[cRequBubSlot];
		const uint32 cTagMask	= (1<<(g_scDMABubbleTag0 + cRequBubSlot));
		//start fenced transfer (fenced because we might need to replace a non synced transfer
		const uint32 cBubEA		= g_GlobalSPUBubbleDir[cBubbleID].ea;
		const uint32 cBubSize	= g_GlobalSPUBubbleDir[cBubbleID].size;
		const uint32 cBubbleDestLS	= (uint32)(g_SPUBubbleMem + g_SPUBubbleSize * cRequBubSlot);
		MemcpyLargeLSFenced(cBubbleDestLS, cBubEA, cBubSize, g_scDMABubbleTag0+cRequBubSlot);
#if defined(DO_SPU_PROFILING)
		NSPU::NDriver::g_PerfStats.bubMemTransferred += cBubSize;
		++NSPU::NDriver::g_PerfStats.bubblesTransferred;
		++NSPU::NDriver::g_PerfStats.bubbleMissesRetMissHandler;
#endif
		*(vec_uint4*)&rState = spu_splats((uint32)0);//reset rState
		rState.curState		= BUB_STATE_READY;
		rState.curIndex		=	cBubbleID;
		//update LRU
		g_SPUBubbleLRUDir = spu_insert(spu_extract(g_BubbleLRUCounter, 0), g_SPUBubbleLRUDir, cRequBubSlot);
		g_SPUBubbleDir		= spu_insert(cBubbleID, g_SPUBubbleDir, cRequBubSlot);
		spu_writech(MFC_WrTagMask, cTagMask);//sync all at once
		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
		spu_readch(MFC_RdTagStat);
		asm volatile("bi %0" : "=r"(curLR) : : );
	}
*/

#if defined(CHECK_BUB_HAZARD)	
.BS1:
	.string	"CodePagingReturnMissHandler: stack pointer underflow"	
.BS2:
	.string	"CodePagingReturnMissHandler: bubble size < 128 byte"	
#endif	
.text
	.align 7
	.global	CodePagingReturnMissHandler
	.type	CodePagingReturnMissHandler, @function
//-----------------------------------------------------------------------------------------------------
// CodePagingReturnMissHandler
//	Purpose:	handles a return that may cross bubbles. Derives the current bubble from $lr, pops the
//						return stack unless it is an inner bubble return, and either branches straight to $lr
//						(fast path) or first streams the required bubble into its recorded slot (.ReplaceBubbles).
//	In:				$lr = return address (never written), $sp is read for mask generation and as base for
//						two stores below the stack pointer (-80 and -32); fixed mask offsets assume a
//						16 byte aligned $sp
//	Out:			control is transferred with bi/binz $lr
//	Globals written:	g_BubbleLRUCounter, g_SPUBubbleLRUDir, g_pReturnStackTop, g_CurBubIndex;
//						on the replace path also g_SPUBubbleDir and g_SPUBubbleStates[cRequBubSlot];
//						g_PerfStats with DO_SPU_PROFILING
//	Clobbers:	low registers are used as scratch ($4..$16 among others, $2 with DO_SPU_PROFILING).
//						$3 is only touched with CHECK_BUB_HAZARD and restored there.
//	Trailing comments give pipeline (0/1) / latency (cycles).
//
//	FIXES relative to the previous revision:
//		- underflow check: sfi computes (imm - ra), so "sfi $31,$31,1" produced 1 - &g_ReturnStack[0]
//			and the hazard test could never fire; now ai $31,$31,-1
//		- DMA loop: sf computes (rb - ra), so "sf $60,$60,$63" produced 16384 - sizeLeft instead of
//			sizeLeft - 16384; operands swapped
//		- rState.curState was filled from $5 (cBubSize) instead of BUB_STATE_READY; now taken from $16
//		- $3 was used as temporary for the DMA destination on the replace path, destroying the
//			value the returning function left in $3; the address now goes straight into $61
//-----------------------------------------------------------------------------------------------------
CodePagingReturnMissHandler:
	ila $65, 66051					//0/2		0x00010203: shuffle pattern that splats the preferred word into all 4 slots
	lqa	$62, _ZN4NSPU17g_pReturnStackTopE			//1/6		load g_pReturnStackTop
	ila	$56, 65535					//0/2		load mask for shorts
	lqa	$64, _ZN4NSPU16g_BubbleMemLowerE			//1/6		load g_BubbleMemLower
	il	$19, 16383					//0/2		gen 16 *1024 - 1 (not read anywhere in this handler)
	lqa	$63, _ZN4NSPU16g_BubbleMemUpperE			//1/6		load g_BubbleMemUpper
	il	$32, 1							//0/2		gen 1
	shufb	$59, $lr, $lr, $65//1/4		const vec_uint4 cLR4 = spu_splats(spu_extract(curLR, 0))
	andi $33, $33, 0				//0/2		nullify rState
	lqa	$60, _ZN4NSPU14g_SPUBubbleDirE				//1/6		load g_SPUBubbleDir
	il	$7,66								//0/2		gen MFC_CMD_WORD(0, 0, MFC_GETF_CMD)
	lqa	$39,_ZN4NSPU18g_BubbleLRUCounterE			//1/6		load g_BubbleLRUCounter
	ai	$55, $62, -16				//0/2		(vec_int4)((int32)g_pReturnStackTop -16)
	lqd $58, 0($62)					//1/6		const SReturnStackEntry cTopEntry = *g_pReturnStackTop
	clgt $48, $59, $64			//0/2		const vec_uint4 cLowerAddrCmpRes = spu_cmpgt(cLR4, g_BubbleMemLower)	
	lqa	$37, _ZN4NSPU17g_SPUBubbleLRUDirE			//1/6		load g_SPUBubbleLRUDir
	clgt $47, $63, $59			//0/2		const vec_uint4 cUpperAddrCmpRes = spu_cmpgt(g_BubbleMemUpper, cLR4)
	lqa	$43,_ZN4NSPU13g_CurBubIndexE					//1/6		load g_CurBubIndex
	ila	$35, _ZN4NSPU17g_SPUBubbleStatesE			//0/2		&g_SPUBubbleStates[0]
#if defined(SUPP_SN)
	lqa $54, _ZN4NSPU7NDriver13g_sDebugStateE	//1/6  load g_sDebugState
#else	
	lnop										//1/0
#endif	
#if defined(CHECK_BUB_HAZARD)		
	ila $31, _ZN4NSPU13g_ReturnStackE	//0/2		load &g_ReturnStack[0]
	lnop										//1/0
	ila $30, _ZN4NSPU11NCodePaging12BubbleHazardEPKc	//0/2		load &BubbleHazard
	lnop										//1/0
#endif	
	and	$46, $47, $48				//0/2		const vec_uint4 cCurBubMask	= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes)
	lqa	$59,_ZN4NSPU20g_GlobalSPUBubbleDirE		//1/6		load g_GlobalSPUBubbleDir ($59 is reused, cLR4 is no longer needed)
	ai	$38, $39, 1					//0/2		++g_BubbleLRUCounter
	lqa $22, _ZN4NSPU15g_SPUBubbleSizeE				//1/6		load g_SPUBubbleSize
	and	$53, $58, $56				//0/2		const uint16 cBubbleID = spu_extract(cTopEntry, 1)
	rotqbyi	$52, $58, 2			//1/4		spu_extract(cTopEntry, 2)
	and	$45, $46, $60				//0/2		spu_and(cCurBubMask, g_SPUBubbleDir)
	lqa $21, _ZN4NSPU14g_SPUBubbleMemE				//1/6		load g_SPUBubbleMem
	ceq	$42, $53, $56				//0/2		(cBubbleID == SReturnStackEntry::cIsJobBubble)
	shufb	$51, $53, $53, $65//1/4		const vec_int4 cDestBubbleID4	= spu_splats((int)cBubbleID)
	shli	$28, $53, 3				//0/4		sizeof(SBubbleInfo) * cBubbleID
	orx	$45, $45						//1/4		const uint32 cCurBubbleID = spu_extract(spu_orx(spu_and(cCurBubMask, g_SPUBubbleDir)), 0)
	and	$52, $52, $56				//0/2		const uint32 cRequBubSlot = (uint32)spu_extract(cTopEntry, 2)
#if defined(DO_SPU_PROFILING)
	lqa	$29, _ZN4NSPU7NDriver11g_PerfStatsE+80//1/6	load ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
#else
	lnop										//1/0
#endif	
	NOP											//0/0
	stqa	$38,_ZN4NSPU18g_BubbleLRUCounterE			//1/6		store ++g_BubbleLRUCounter
	ceq	$50, $51,$60				//0/2		const vec_uint4 cDestBubMask = spu_cmpeq(cDestBubbleID4,g_SPUBubbleDir)
	lnop										//1/0
	ceq	$44, $45,$43 				//0/2		const vec_uint4 cCurBubbleCmp = spu_cmpeq(g_CurBubIndex, (vec_uint4)cCurBubbleID)
	lnop										//1/0
	selb $36, $37, $38, $50	//0/2		g_SPUBubbleLRUDir = spu_sel(g_SPUBubbleLRUDir, g_BubbleLRUCounter, cDestBubMask)
	gb	$49, $50						//1/4		spu_gather(cDestBubMask)
	or	$42, $42, $44				//0/2		(cBubbleID == SReturnStackEntry::cIsJobBubble) || (spu_extract(cCurBubbleCmp, 0) != 0)
	hbr .SimpleFastRet, $lr	//1/10  branch hint for expected simple return destination (link register cont.)
	selb $40, $55, $62, $44 //0/2	  new g_pReturnStackTop: unchanged for an inner bubble return, otherwise popped by one entry
	stqa $36, _ZN4NSPU17g_SPUBubbleLRUDirE			//1/6		store updated g_SPUBubbleLRUDir (overwritten if !cSimpleFastRet)
	selb $51, $51, $43, $44	//0/2		g_CurBubIndex = spu_sel(cDestBubbleID4,g_CurBubIndex,cCurBubbleCmp)
	lqx	$27, $59, $28				//1/6		load g_GlobalSPUBubbleDir[cBubbleID]
#if defined(CHECK_BUB_HAZARD)		
	ai $31, $31, -1					//0/2		FIX: &g_ReturnStack[0] - 1 (sfi would yield 1 - address), so cgt below implements >=
	rotqbyi	$16, $3, 0			//1/4		save $3	
	cgt $31, $40, $31				//0/2		(g_pReturnStackTop >= &g_ReturnStack[0])
	lnop										//1/0
	ila $3, .BS1						//0/2		put string into parameter register for BubbleHazard()
	biz $31, $30						//1/4		if((uint32)g_pReturnStackTop < &g_ReturnStack[0]) BubbleHazard()
	ori	$3, $16, 0					//0/2		restore $3
	lnop										//1/0	
#endif
	clz	$49, $49						//0/2		spu_cntlz(spu_gather(cDestBubMask))
	stqa $40, _ZN4NSPU17g_pReturnStackTopE			//1/6		store updated g_pReturnStackTop
	shli $15, $52, 2				//0/4		cRequBubSlot * 4 for insertion mask for g_SPUBubbleLRUDir
#if defined(DO_SPU_PROFILING)	
	rotqbyi $39, $29, 0			//1/4		copy orig. contents of NSPU::NDriver::g_PerfStats
#else	
	lnop										//1/0
#endif	
	ai	$49, $49, -28				//0/2		const uint32 cDestBubSlot	= spu_extract(spu_add(spu_cntlz(spu_gather(cDestBubMask)), -28), 0)
	stqd	$sp,-80($sp)			//1/6		store stack pointer for cellDmaLargeCmd
	andi $26, $28, 8				//0/2		gen rotation number to access g_GlobalSPUBubbleDir[cBubbleID].ea
	stqd $lr, -32($sp)			//1/6		store link register for cellDmaLargeCmd
	ceq	$41, $49, $52				//0/2		(cDestBubSlot == cRequBubSlot)
	stqa $51,	_ZN4NSPU13g_CurBubIndexE					//1/6		store g_CurBubIndex = (vec_uint4)spu_extract(cDestBubbleID4, 0)
	ai	$25, $26, 4					//0/2		gen rotation number to access g_GlobalSPUBubbleDir[cBubbleID].size
	rotqby	$4, $27, $26		//1/4		const uint32 cBubEA	= g_GlobalSPUBubbleDir[cBubbleID].ea
	or	$42, $42, $41				//0/2		const bool cSimpleFastRet = (complete now)
	cwx	$14, $sp, $15				//1/4		complete insertion mask for g_SPUBubbleLRUDir
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$16, 4($sp)					//1/4		create insertion mask for returnMissHandlerCalls
	NOP											//0/0
	rotqbyi	$2, $29, 2			//1/4		rotate NSPU::NDriver::g_PerfStats.returnMissHandlerCalls into pref.slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.returnMissHandlerCalls
	shufb	$29, $2, $29, $16	//1/4		reinsert returnMissHandlerCalls into the proper pos
	ori $39, $29, 0					//0/2		(STALL 3) copy new original contents	
	stqa	$29, _ZN4NSPU7NDriver11g_PerfStatsE+80		//1/6 store updated g_PerfStats
#endif	
	ai $6, $52, 8						//0/2		g_scDMABubbleTag0 + cRequBubSlot	
	rotqby	$5, $27, $25		//1/4		const uint32 cBubSize	= g_GlobalSPUBubbleDir[cBubbleID].size
	mpyu $20, $22, $52			//0/7		g_SPUBubbleSize * cRequBubSlot (both operands are uint16)
#if defined(SUPP_SN)			//modelled so that is does not branch if no debugging is enabled and cSimpleFastRet is true
	brz $42, .ReplaceBubbles//1/4		if(!cSimpleFastRet) replace bubble
	brnz	$54, .SPU_DEBUG_BREAK_RET	//1/4		if(IsDebugEnabled()) snPause()
#endif	
.SimpleFastRet:
	binz $42, $lr						//1/4		if(cSimpleFastRet) branch to link register cont.

.ReplaceBubbles:
	//slow path: stream the required bubble into slot cRequBubSlot, update directories, sync, return
	shli $34, $52, 4				//0/4		sizeof(SBubbleState) * cRequBubSlot	
	lnop										//1/0	
	shl	$65,$32,$6					//0/4		const uint32 cTagMask	= (1<<(g_scDMABubbleTag0 + cRequBubSlot))
	shufb	$13, $38, $37, $14//1/4		g_SPUBubbleLRUDir = spu_insert(spu_extract(g_BubbleLRUCounter, 0), g_SPUBubbleLRUDir, cRequBubSlot)
	ila $63, 16384					//0/2		load 16*1024
	shufb	$12, $53, $60, $14//1/4		g_SPUBubbleDir = spu_insert(cBubbleID, g_SPUBubbleDir, cRequBubSlot)
#if defined(CHECK_BUB_HAZARD)	
	clgti $47, $5, 127			//0/2		(cBubSize > 127)
	lnop										//1/0
#endif	
	a $61, $20, $21					//0/2		FIX: curDest = cBubbleDestLS = g_SPUBubbleMem + g_SPUBubbleSize * cRequBubSlot (no longer routed through $3)
	lnop										//1/0	
	ori $60, $5, 0					//0/2		int sizeLeft = (int)cSize
	stqa	$13, _ZN4NSPU17g_SPUBubbleLRUDirE			//1/6		store updated g_SPUBubbleLRUDir
	a	$11, $35, $34					//0/2		&g_SPUBubbleStates[cRequBubSlot]
	lnop										//1/0	
	NOP											//0/0		takes the slot of the former copy $3 -> $61
	stqa	$12, _ZN4NSPU14g_SPUBubbleDirE				//1/6		store updated g_SPUBubbleDir
#if defined(CHECK_BUB_HAZARD)		
	ori $16, $3, 0					//0/2		save $3
	ila $3, .BS2						//0/2		put string into parameter register for BubbleHazard()
	biz $47, $30						//1/4		if(cBubSize <= 127) BubbleHazard()
	ori	$3, $16, 0					//0/2		restore $3	
	lnop										//1/0	
#endif
	ori $62, $4, 0					//0/2   uint32 curSource = cSource
	chd	$31, 2($11)					//1/4		generate controls for updating rState.curState	
	cgt $8,$60,$63					//0/2		(sizeLeft>16*1024)?	(for first iteration)
#if defined(DO_SPU_PROFILING)
	cwd	$22,0($sp)					//1/4		gen insertion mask for g_PerfStats.bubMemTransferred
	a	$29, $29, $5					//0/2		NSPU::NDriver::g_PerfStats.bubMemTransferred += cBubSize
	shufb	$29,$29,$39,$22		//1/4		(STALL 3) mask non affected bytes
	ori $39, $29, 0					//0/2		(STALL 3) copy new original contents
#endif
	lnop										//1/0	
.MemcpyLS:
	il $16, 2								//0/2		gen 2, used as MFC_TAG_UPDATE_ALL and as BUB_STATE_READY
	wrch	$ch20, $6					//1/6		si_wrch(MFC_TagID,si_from_uint(g_scDMABubbleTag0+cRequBubSlot))
.MemcpyLargeLSLoop:	
	//one fenced get per iteration, at most 16 KB each; $8 holds the compare result of the previous pass
	selb $8,$60,$63,$8			//0/2		(sizeLeft>16*1024)?16*1024 : sizeLeft)
	wrch $ch18, $62					//1/6		si_wrch(MFC_EAL,si_from_uint(curSource))
	sf	$60,$63,$60					//0/2		FIX: sizeLeft	-= 16*1024 (sf rt,ra,rb yields rb - ra)
	hbr .RetAfterMemcpy, $lr//1/10	branch hint for return jump
	a $62, $62, $63					//0/2		curSource	+= 16*1024
	wrch $ch16, $61					//1/6		si_wrch(MFC_LSA, (int)curDest)
	cgti $9, $60, 0					//0/2		(sizeLeft > 0)?
	wrch	$ch19, $8					//1/6		si_wrch(MFC_Size, size of this chunk)
	cgt $8,$60,$63					//0/2		(sizeLeft>16*1024)?
	wrch	$ch21, $7					//1/6		si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD))
	a $61, $61, $63					//0/2		curDest	+= 16*1024
	brnz $9, .MemcpyLargeLSLoop		//1/4		while(sizeLeft > 0)
.AfterMemcpy:	
	chd	$55, 0($11)					//1/4		generate controls for updating rState.curIndex	
	shufb	$33,$16,$33,$31		//1/4		FIX: rState.curState = BUB_STATE_READY ($16 holds 2; $5 held cBubSize)
	shufb	$33, $53, $33, $55//1/4		(STALL 3) rState.curIndex	=	cBubbleID	
	stqd $33, 0($11)				//1/6		(STALL 3) store g_SPUBubbleStates[cRequBubSlot]	
	wrch $ch22,$65					//1/6		spu_writech(MFC_WrTagMask, cTagMask)
	wrch $ch23,$16					//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$16, 12($sp)				//1/4		create insertion mask for bubbleMissesRetMissHandler
	NOP											//0/0
	rotqbyi	$2, $29, 10			//1/4		rotate NSPU::NDriver::g_PerfStats.bubbleMissesRetMissHandler into pref.slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.bubbleMissesRetMissHandler
	shufb	$29, $2, $29, $16	//1/4		reinsert bubbleMissesRetMissHandler into the proper pos
	NOP											//0/0
	chd	$16, 14($sp)				//1/4		create insertion mask for bubblesTransferred
	NOP											//0/0
	rotqbyi	$2, $29, 12			//1/4		rotate NSPU::NDriver::g_PerfStats.bubblesTransferred into pref.slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.bubblesTransferred
	shufb	$29, $2, $29, $16	//1/4		reinsert bubblesTransferred into the proper pos
#endif	
	rdch $10,$ch24					//1/4		spu_readch(MFC_RdTagStat), blocks until the tag group has completed
#if defined(DO_SPU_PROFILING)	
	stqa	$29, _ZN4NSPU7NDriver11g_PerfStatsE+80		//1/6 store updated g_PerfStats
	NOP											//0/0	
#endif
#if defined(SUPP_SN)
	brnz	$54, .SPU_DEBUG_BREAK_RET	//1/4		if(IsDebugEnabled()) snPause()
	NOP											//0/0		
#endif	
.RetAfterMemcpy:
	bi $lr									//1/4		branch to link register cont.
#if defined(SUPP_SN)
.SPU_DEBUG_BREAK_RET:
	//debugger support: publish the return target in the quadword at LS address 16, then stop
	lqa	$54,16							//1/6		load contents of upper 16 bytes of SPUDriver image spu_mod_hdr
	cwd	$55,12($sp)					//1/4		generate control word for insertion into spu_mod_hdr->pad
	shufb	$54,$lr,$54,$55		//1/4		insert branch target
	stqa	$54,16						//1/6		store updated spu_mod_hdr
	NOP											//0/0										
	lqa	$54,16							//1/6		make sure it has completed storage before stop
	STOPD										//1/4		custom snPause()
	bi	$lr									//1/4		branch to link register cont.
#endif

	.size	CodePagingReturnMissHandler, .-CodePagingReturnMissHandler


#endif //PS3
