#if defined(PS3)

//no-op annotated below as pipeline 0 (0/0), used to pad the hand scheduled instruction pairs
//	(its counterpart for pipeline 1 is the plain lnop instruction)
#define NOP nop $127
//stop instruction with code 254, used by the miss handlers as custom snPause() if SPU debugging is enabled
#define STOPD stop 254

//capacity of g_ReturnStack in entries of 16 bytes each
//	in the code visible here it is only used by the CHECK_PAGE_HAZARD return stack overflow test
#define RETURN_STACK_MAX_ENTRIES 48

//miss handler for code paging
//behind each instruction: pipeline (0/1) / latency(cycles)

//Code paging cross call miss handler
//	must be implemented in asm to keep the parameter registers untouched (uses high registers only, keeps 3..19)
//	is entered via a bra so that the link register stays the same as well (do not forget the branch hint for the bra)
//	$71 will contain the 2x8 bytes of the cross call entry, compare branch offset to link register-4 to obtain the
//		correct destination page id and offset
//	pushes the return stack and updates LRU for both pages as well
//
//	algorithmic idea (not executed in order necessarily):
//		1. get current source page instruction addr. from link register
//		2. get the current source page base address from current instruction address
//		3. load page directory
//		4. compare instruction address to $71 (g_CrossPageData) to obtain the desired destination entry
//		5. use this (4.) to get the destination page ID
//		6. get the current slot of the destination page
//		7. check state of the page, if still in streaming state: sync its dma transfer
//		8. update its (7.) state to e_PageStateReady
//		9. obtain the current page base address from its slot
//		10.compose absolute destination address of dest.Page and the respective function call
//		11.load current Page LRU directory and counter
//		12.update LRU for calling and destination Page slot 
//		13.perform a bi to this address (link register stays untouched)

//	to support SPU Debugging, before exiting the miss handlers, a stopd (snPause) is executed 
//		in case debugging is enabled for the current job


/*	C++ version:
		void CodePagingCallMissHandler()
		{
			const vec_int4 cPageID4							= spu_splats((int32)spu_extract(g_CrossPageData, 1) & NPageBin::SCrossPatch::scMaxPageID);
			const vec_uint4 cCurPageMask				= spu_cmpeq(cPageID4, g_SPUPageDir);//mask for source.page
			const int32 cDestPageID							= (int32)spu_extract(g_CrossPageData, 4);
			const bool cIsWeak									= (cDestPageID & NPageBin::SCrossPatch::scMaxPageID) != cDestPageID;
			const uint32 cDestPageOff						= (int32)spu_extract(g_CrossPageData, 3);
			//use id to get slot/mask of corresponding page slot
			const vec_int4 cDestPageID4					= spu_splats(cDestPageID);
			const vec_uint4 cDestPageMask				= spu_cmpeq(cDestPageID4, g_SPUPageDir);//mask for dest.page
			//update LRU
			g_PageLRUCounter										= spu_add(g_PageLRUCounter, 1);
			g_SPUPageLRUDir		 									= spu_sel(g_SPUPageLRUDir, g_PageLRUCounter, cCurPageMask);
			g_SPUPageLRUDir	 										= spu_sel(g_SPUPageLRUDir, g_PageLRUCounter, cDestPageMask);			
			//get index of the page to access g_SPUPageStates
			const uint32 cDestPageSlot					= spu_extract(spu_orx(spu_and(cDestPageMask, g_SPUPageIndexMask0)), 0);
			//compose full address for the branch destination
			const uint32 cAbsBranchTargetAddr		= cDestPageOff + spu_extract(spu_orx(spu_and(cDestPageMask, g_PageMemLower)), 0);
			//sync dma transfer and change state
			SPageState& rState = g_SPUPageStates[cDestPageSlot];
#if defined(DO_SPU_PROFILING)
		++NSPU::NDriver::g_PerfStats.callMissHandlerCalls;
#endif
			if(rState.curState == PAGE_STATE_STREAMING)
			{
				rState.curState				= PAGE_STATE_READY;
				const uint32 cTagMask	= (1<<(g_scDMAPageTag0 + cDestPageSlot));
				spu_writech(MFC_WrTagMask, cTagMask);//sync all at once
				spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
				spu_readch(MFC_RdTagStat);
#if defined(DO_SPU_PROFILING)
		++NSPU::NDriver::g_PerfStats.syncsInCallMissHandler;
#endif			
			}
			bool retStackChangeState = cIsWeak;
			const uint32 cInnerPageCmpRes = spu_extract(spu_cmpeq(spu_promote(g_CurPageIndex,0), cDestPageID),0);
			retStackChangeState |= cInnerPageCmpRes;
			g_CurPageIndex = (vec_uint4)cDestPageID;
			//update return stack
			vec_ushort8 crossPageDataMod = g_CrossPageData;
			const uint16 cCurPageSlot = spu_extract(spu_add(spu_cntlz(spu_gather(cCurPageMask)), -28), 0);			
			crossPageDataMod = spu_insert(cCurPageSlot, crossPageDataMod, 2);
			//do not increment return stack if it is a weak cross call or inner page call
			g_pReturnStackTop += retStackChangeState?0:1;			
			*((vec_ushort8*)g_pReturnStackTop) = crossPageDataMod;
		}
*/
	.file	"MissHandler_spu.S"
#if defined(CHECK_PAGE_HAZARD)	
.BS0:
	.string	"CodePagingCallMissHandler: stack pointer overflow"	
.BS3:
	.string	"CodePagingCallMissHandler: page not present"	
#endif	
//-----------------------------------------------------------------------------------------------------------
//CodePagingCallMissHandler
//	entry:	$71 = g_CrossPageData (the cross call entry), $lr = link register of the original call
//	exit:		bi $52 to the absolute address inside the destination page
//					$lr and $3 hold their entry values again ($3 is parked in $46 while it is used as scratch)
//	scratch registers: $2, $20..$29, $34..$63 (parameter registers above $3 are only touched, saved and
//		restored, on the .PageNotPresent path)
//	globals read/written: g_CurPageIndex, g_SPUPageDir, g_PageMemLower, g_SPUPageStates, g_PageLRUCounter,
//		g_SPUPageLRUDir, g_SPUPageIndexMask0, g_SPUPageIndexMaskShl4, g_pReturnStackTop
//	channels: $ch22 = MFC_WrTagMask, $ch23 = MFC_WrTagUpdate, $ch24 = MFC_RdTagStat (only on the .SyncDMA path)
//	build switches:
//		DO_SPU_PROFILING		updates the counters in ((vec_uint4*)&g_PerfStats)[5]
//		CHECK_PAGE_HAZARD		return stack overflow test and test for a zero quadword at the branch destination
//		SUPP_SN							executes STOPD before leaving if g_sDebugState is non zero
//	the instruction order and the NOP/lnop padding are hand scheduled (pipeline/latency is noted behind
//		each instruction), do not reorder instructions without rechecking the pairing
//-----------------------------------------------------------------------------------------------------------
.text
	.align 6
	.global	CodePagingCallMissHandler
	.type	CodePagingCallMissHandler, @function
CodePagingCallMissHandler:
	ori $46, $3,0						//0/2		save $3, it is used as scratch below and restored before any exit
	lqa $21, _ZN4NSPU14g_CurPageIndexE	//1/6		load current g_CurPageIndex
	ila $44, 32767					//0/2		load mask for pages
	rotqbyi	$60,$71,6				//1/4		spu_extract(g_CrossPageData, 4)
	ila $62, 66051					//0/2		load mask for shuffling ints into all 4 slots
	lqa	$45, _ZN4NSPU12g_SPUPageDirE	//1/6	load g_SPUPageDir
	and $43, $71, $44				//0/2		short mask for spu_extract(g_CrossPageData, 1)
#if defined(DO_SPU_PROFILING)
	lqa	$36, _ZN4NSPU7NDriver11g_PerfStatsE+80	//1/6	load ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
	NOP											//0/0
#endif
	rotqbyi	$3,$71,4				//1/4		const uint32 cDestPageOff	= (int32)spu_extract(g_CrossPageData, 3)
#if defined(CHECK_PAGE_HAZARD)
	ila $39, _ZN4NSPU13g_ReturnStackE	//0/2		load &g_ReturnStack[0]
	ila $38, _ZN4NSPU11NCodePaging10PageHazardEPKc	//0/2		load &PageHazard
#endif
	ila $41, 65535					//0/2		load mask for shorts
	lqa	$51,_ZN4NSPU14g_PageMemLowerE	//1/6		load g_PageMemLower
	and $44,$60,$44					//0/2		short mask for spu_extract(g_CrossPageData, 4)
	shufb	$58,$43,$43,$62		//1/4		const vec_int4 cPageID4	= spu_splats((int32)spu_extract(g_CrossPageData, 1))
	ila	$50, _ZN4NSPU15g_SPUPageStatesE	//0/2	load NSPU::g_SPUPageStates
	lqa	$57,_ZN4NSPU16g_PageLRUCounterE	//1/6		load g_PageLRUCounter
	and $53,$3,$41					//0/2		const uint32 cDestPageOff	= (int32)spu_extract(g_CrossPageData, 3)
	shufb	$20,$44,$44,$62		//1/4		const vec_int4 cDestPageID4	= spu_splats(cDestPageID)
	and $60,$60,$41					//0/2		full short mask to check for scWeak
	lqa	$55,_ZN4NSPU22g_SPUPageIndexMaskShl4E	//1/6		load g_SPUPageIndexMaskShl4
	ceq	$58, $58, $45				//0/2		const vec_uint4 cCurPageMask	= spu_cmpeq(cPageID4, g_SPUPageDir);//mask for source.page
	lqa	$43,_ZN4NSPU19g_SPUPageIndexMask0E	//1/6		load g_SPUPageIndexMask0
	il $40, 2								//0/2		gen PAGE_STATE_READY (and MFC_TAG_UPDATE_ALL)
	lqa	$56,_ZN4NSPU15g_SPUPageLRUDirE	//1/6		load g_SPUPageLRUDir
#if defined(CHECK_PAGE_HAZARD)
	ila $35, (RETURN_STACK_MAX_ENTRIES-1)*16	//0/2		byte offset of the last return stack entry
	a $39, $39, $35					//0/2		&g_ReturnStack[0] + (RETURN_STACK_MAX_ENTRIES-1) * sizeof(SReturnStackEntry)
#endif
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$35, 6($sp)					//1/4		create insertion mask for callMissHandlerCalls
	NOP											//0/0
	rotqbyi	$2, $36, 4			//1/4		rotate callMissHandlerCalls into preferred slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.callMissHandlerCalls
	shufb	$36, $2, $36, $35	//1/4		reinsert callMissHandlerCalls into its place within g_PerfStats
#endif
	ceq	$59, $20, $45				//0/2		const vec_uint4 cDestPageMask = spu_cmpeq(cDestPageID4, g_SPUPageDir);//mask for dest.page
	gb	$42, $58						//1/4		spu_gather(cCurPageMask)
	ai	$57,$57,1						//0/2		g_PageLRUCounter = spu_add(g_PageLRUCounter, 1);
	lqa	$63, _ZN4NSPU17g_pReturnStackTopE		//1/6		load g_pReturnStackTop
	and	$52, $59,	$51				//0/2		spu_and(cDestPageMask, g_PageMemLower)
	rotqbii $53, $53, 2			//1/4		cDestPageOff << 2 (stored as multiple of 4)
	and	$49, $59,	$55				//0/2		spu_and(cDestPageMask, g_SPUPageIndexMaskShl4) (generates offset into SPageState)
	chd	$55,4($sp)					//1/4		generate controls for insertion of SReturnStackEntry::pageSlot
	and	$54, $59,	$43				//0/2		spu_and(cDestPageMask, g_SPUPageIndexMask0)
	orx	$52,$52							//1/4		spu_extract(spu_orx(spu_and(cDestPageMask, g_PageMemLower)), 0)
	clz	$42, $42						//0/2		spu_cntlz(spu_gather(cCurPageMask))
	orx	$49,$49							//1/4		const uint32 cDestPageSlotShl4	= spu_extract(spu_orx(spu_and(cDestPageMask, g_SPUPageIndexMaskShl4)), 0)
	selb $56,$56,$57,$58		//0/2		g_SPUPageLRUDir	= spu_sel(g_SPUPageLRUDir, g_PageLRUCounter, cCurPageMask)
	orx	$54,$54							//1/4		const uint32 cDestPageSlot	= spu_extract(spu_orx(spu_and(cDestPageMask, g_SPUPageIndexMask0)), 0)
	ceq	$20,$21,$20					//0/2		const uint32 cInnerPageCmpRes = spu_extract(spu_cmpeq(spu_promote(g_CurPageIndex,0), cDestPageID),0)
	rotqbyi $3,$46,0				//1/4		restore $3
	ai	$42,$42,-28					//0/2		const uint16 cCurPageSlot = spu_add(spu_cntlz(spu_gather(cCurPageMask)), -28)
	chd	$62,2($sp)					//1/4		generate controls for insertion of (rState.curState = PAGE_STATE_READY)
	ceq  $22,$44,$60				//0/2		if equal -> isWeak == 0, so $22 is set for a NON weak call
	brz $52, .PageNotPresent	//1/4		if(spu_extract(spu_orx(spu_and(cDestPageMask, g_PageMemLower)), 0) == 0) call SetActivePages
	a	$52, $52, $53					//0/2		const uint32 cAbsBranchTargetAddr	= cDestPageOff + spu_extract(spu_orx(spu_and(cDestPageMask, g_PageMemLower)), 0)
	lqx	$48,$49,$50					//1/6		SPageState& rState = g_SPUPageStates[cDestPageSlot]
	ai	$41, $63, 16				//0/2		g_pReturnStackTop + 1 (the entry is always written there, see the stqd below)
#if defined(SUPP_SN)
	lqa $34, _ZN4NSPU7NDriver13g_sDebugStateE	//1/6		load g_sDebugState
#else
	lnop										//1/0
#endif
	ai $53, $54, 8					//0/2		g_scDMAPageTag0 + cDestPageSlot
	hbr .Ret, $52						//1/10	branch hint for the bi $52 at .Ret
	selb $56,$56,$57,$59		//0/2		g_SPUPageLRUDir	= spu_sel(g_SPUPageLRUDir, g_PageLRUCounter, cDestPageMask)
	lnop										//1/0
#if defined(CHECK_PAGE_HAZARD)
	cgt $39, $41, $39				//0/2		(g_pReturnStackTop + 1 > &g_ReturnStack[RETURN_STACK_MAX_ENTRIES-1]), signed compare
	rotqbyi	$37, $3, 0			//1/4		save $3
	ila $3, .BS0						//0/2		put string into parameter register for PageHazard()
	binz $39, $38						//1/4		on overflow jump (no link) to PageHazard()
	ori $3, $37, 0					//0/2		restore $3
	lnop										//1/0
	il	$51, 1							//0/2		gen 1
#else
	il	$51, 1							//0/2		gen 1
#endif
	shufb	$42,$42,$71,$55		//1/4		crossPageDataMod = spu_insert(cCurPageSlot, crossPageDataMod, 2)
	andc	$22,$22,$20				//0/2		push mask = !cIsWeak & !cInnerPageCmpRes (the inverse of retStackChangeState)
	stqa $57,_ZN4NSPU16g_PageLRUCounterE	//1/6		store g_PageLRUCounter
	ceqhi $45,$48,2					//0/2		rState.curState == PAGE_STATE_READY
	shufb	$59,$40,$48,$62		//1/4		rState.curState = PAGE_STATE_READY (no store yet)
	selb $63,$63,$41,$22		//0/2		advance g_pReturnStackTop only for a non weak, non inner page call
	stqa	$56,_ZN4NSPU15g_SPUPageLRUDirE	//1/6		store g_SPUPageLRUDir
	shl	$51,$51,$53					//0/4		const uint32 cTagMask	= (1<<(g_scDMAPageTag0 + cDestPageSlot))
	stqd $42, 0($41)				//1/6		write crossPageDataMod to (old g_pReturnStackTop + 1), also if the top is not advanced
	selb $54,$21,$44,$22		//0/2		keep g_CurPageIndex if the push mask is clear (it is used in SetActivePages), else cDestPageID
	brhz $45,.SyncDMA				//1/4		if(rState.curState != PAGE_STATE_READY) branch, expect it not
	NOP											//0/0
#if defined(DO_SPU_PROFILING)
	stqa	$36, _ZN4NSPU7NDriver11g_PerfStatsE+80	//1/6	save ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
	NOP											//0/0
#endif
	stqa	$63, _ZN4NSPU17g_pReturnStackTopE		//1/6		store (conditionally incremented) g_pReturnStackTop
	NOP											//0/0
	stqa $54, _ZN4NSPU14g_CurPageIndexE	//1/6		g_CurPageIndex = isWeak?g_CurPageIndex:cDestPageID
	NOP											//0/0
#if defined(SUPP_SN)
	brnz	$34, .SPU_DEBUG_BREAK	//1/4		if(IsDebugEnabled()) snPause()
	NOP											//0/0
#endif
#if defined(CHECK_PAGE_HAZARD)
	lqd $29, 0($52)					//1/6		load jump destination to check for non 0
	brnz $29,.JumpTestPassed1	//1/4
	NOP											//0/0
	stop 255								//1/4		stop debugger if dest. is zero
.JumpTestPassed1:
#endif
	//.Ret must label the bi itself, it is the instruction the hbr above hints
	//	(also with CHECK_PAGE_HAZARD enabled, where the destination test precedes it)
.Ret:
	bi	$52									//1/4		branch to the calculated absolute address in the desired page
.SyncDMA:
	//do not try to add a branch hint here, will hang up SPU
//	NOP											//0/0
	wrch $ch22,$51					//1/6		spu_writech(MFC_WrTagMask, cTagMask)
//	NOP											//0/0
	wrch $ch23,$40					//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
//	NOP											//0/0
	stqd $42, 0($41)				//1/6		write crossPageDataMod to (old g_pReturnStackTop + 1) (repeats the store done before the branch here)
//	NOP											//0/0
	stqx $59,$49,$50				//1/6		store updated rState
	NOP											//0/0
	stqa	$63, _ZN4NSPU17g_pReturnStackTopE		//1/6		store (conditionally incremented) g_pReturnStackTop
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$35, 10($sp)				//1/4		create insertion mask for syncsInCallMissHandler
	NOP											//0/0
	rotqbyi	$2, $36, 8			//1/4		rotate syncsInCallMissHandler into preferred slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.syncsInCallMissHandler
	shufb	$36, $2, $36, $35	//1/4		reinsert syncsInCallMissHandler into its place within g_PerfStats
	NOP											//0/0
	stqa	$36, _ZN4NSPU7NDriver11g_PerfStatsE+80	//1/6	save ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
#endif
	NOP											//0/0
	stqa $54, _ZN4NSPU14g_CurPageIndexE	//1/6		g_CurPageIndex = isWeak?g_CurPageIndex:cDestPageID
	rdch $57,$ch24					//1/6		spu_readch(MFC_RdTagStat), the result in $57 is not used
//	NOP											//0/0
#if defined(SUPP_SN)
	brnz	$34, .SPU_DEBUG_BREAK	//1/4		if(IsDebugEnabled()) snPause()
	NOP											//0/0
#endif
.RetDMA:
#if defined(CHECK_PAGE_HAZARD)
	lqd $29, 0($52)					//1/6		load jump destination to check for non 0
	brnz $29,.JumpTestPassed2	//1/4
	NOP											//0/0
	stop 255								//1/4		stop debugger if dest. is zero
.JumpTestPassed2:
#endif
	bi	$52									//1/4		branch to the calculated absolute address in the desired page
#if defined(SUPP_SN)
.SPU_DEBUG_BREAK:
	stqa $lr,261616					//1/6		store $lr at the fixed local store address 261616 (NOTE(review): presumably read by the debugger, confirm)
	STOPD										//1/4		custom snPause()
	NOP											//0/0
	bi	$52									//1/4		branch to the calculated absolute address in the desired page
#endif

//save registers lr,3,4,5,6, generate the page ids, call SetActivePages and sync dma as above
//NOTE(review): SetActivePages is not visible here. The code after the call only works if it leaves
//	$20..$28, $40, $42, $44, $50, $55, $62, $63 and $71 ($36 with DO_SPU_PROFILING) intact and returns
//	the page base address in $52, cDestPageSlotShl4 in $39 and the dma tag number in $53 - confirm
//	against its implementation before changing anything on this path
//NOTE(review): the LRU counter/directory values prepared in $57/$56 are not stored on this path,
//	presumably SetActivePages maintains the LRU data itself - confirm
.PageNotPresent:
	ori $25, $3, 0					//0/2		save $3
	lnop										//1/0
	ori $24, $lr, 0					//0/2		save $lr
	rotqbyi $23,$53,0				//1/4		save $53 = cDestPageOff
	ori $26, $4, 0					//0/2		save $4
	fsmb $4, $44						//1/4		encode cDestPageID for SetActivePages
	ori $27, $5, 0					//0/2		save $5
//	lnop										//1/0
	ori $28, $6, 0					//0/2		save $6
//	lnop										//1/0
	il $5,-0x0001						//0/2		spu_maskb((unsigned short)65535)
	fsmb $3, $21						//1/4		encode current page id for SetActivePages
	il $6,-0x0001						//0/2		spu_maskb((unsigned short)65535)
	brsl	$lr, SetActivePages	//1/4		call SetActivePages
	ori $lr, $24, 0					//0/2		restore $lr
	hbra .PageNotPresentCallSync, .SyncDMA	//1/10	branch hint for branch to sync of dma transfer
	ori $3, $25, 0					//0/2		restore $3
	rotqbyi $49, $39, 0			//1/4		move cDestPageSlotShl4 into expected register
	ori $4, $26, 0					//0/2		restore $4
//	lnop										//1/0
	ori $5, $27, 0					//0/2		restore $5
//	lnop										//1/0
	ori $6, $28, 0					//0/2		restore $6
	shufb	$42,$42,$71,$55		//1/4		crossPageDataMod = spu_insert(cCurPageSlot, crossPageDataMod, 2)
	a	$52, $52, $23					//0/2		const uint32 cAbsBranchTargetAddr	= cDestPageOff + page dest address
#if defined(SUPP_SN)
	lqa $34, _ZN4NSPU7NDriver13g_sDebugStateE	//1/6		load g_sDebugState
#else
	lnop										//1/0
#endif
	ai	$41, $63, 16				//0/2		g_pReturnStackTop + 1
	lnop										//1/0
	//NOTE(review): the resident page path combines these two masks with andc (!cIsWeak & !cInnerPageCmpRes),
	//	here it is an or. Both agree only while cInnerPageCmpRes is 0 for a non resident page - confirm intent
	or		$22,$22,$20				//0/2		push mask = !cIsWeak | cInnerPageCmpRes
	lnop										//1/0
#if defined(CHECK_PAGE_HAZARD)
	ila $39, _ZN4NSPU13g_ReturnStackE	//0/2		load &g_ReturnStack[0]
	ila $38, _ZN4NSPU11NCodePaging10PageHazardEPKc	//0/2		load &PageHazard
	ila $35, (RETURN_STACK_MAX_ENTRIES-1)*16	//0/2		byte offset of the last return stack entry
	a $39, $39, $35					//0/2		&g_ReturnStack[0] + (RETURN_STACK_MAX_ENTRIES-1) * sizeof(SReturnStackEntry)
	cgt $39, $41, $39				//0/2		(g_pReturnStackTop + 1 > &g_ReturnStack[RETURN_STACK_MAX_ENTRIES-1]), signed compare
	rotqbyi	$2, $3, 0				//1/4		save $3
	ila $3, .BS0						//0/2		put string into parameter register for PageHazard()
	binz $39, $38						//1/4		on overflow jump (no link) to PageHazard()
	ori $3, $2, 0						//0/2		restore $3
	lnop										//1/0
	il	$51, 1							//0/2		gen 1
#else
	il	$51, 1							//0/2		gen 1
#endif
	//NOTE(review): $48 (rState) is not loaded by this handler on this path (the lqx sits behind the brz),
	//	so its contents must come from SetActivePages - confirm
	shufb	$59,$40,$48,$62		//1/4		rState.curState = PAGE_STATE_READY (no store yet)
	selb $54,$21,$44,$22		//0/2		keep g_CurPageIndex if the push mask is clear (it is used in SetActivePages), else cDestPageID
	lnop										//1/0
	selb $63,$63,$41,$22		//0/2		advance g_pReturnStackTop only if the push mask is set
	stqd $42, 0($41)				//1/6		write crossPageDataMod to (old g_pReturnStackTop + 1)
	shl	$51,$51,$53					//0/4		const uint32 cTagMask	= (1<<(g_scDMAPageTag0 + cDestPageSlot))
.PageNotPresentCallSync:
	br .SyncDMA							//1/4		sync dma

	.size	CodePagingCallMissHandler, .-CodePagingCallMissHandler





//Code paging cross page return miss handler
//	must be implemented in asm to keep the $lr unchanged and for speed reasons
//	each entry function is permanently patched to the return miss handler
//		inner calls to those entry functions will go through it as well
//	it extracts the current page ID from the $lr to determine whether it is an inner page return call
//		if not, it pops the return stack to obtain the desired page ID and slot,
//			a special check needs to be added for the actual return into the execute function of the job
//	if the return page is in the same slot as required, the branch goes straight using the $lr
//	if page is not present in its expected slot, stream and replace the page accordingly
//	the return page can be in the same page slot as currently in 
//		since the return miss handler code is outside the replaced page
//	perform a branch indirect to the destination given by $lr
//
//	to support SPU Debugging, before exiting the miss handlers, a stopd (snPause) is executed 
//		in case debugging is enabled for the current job


/* C++ version (not working, pseudo template)
	void CodePagingReturnMissHandler()
	{
		vec_uint4 curLR;
		asm volatile("ori %0, $lr, 0" : "=r"(curLR) : : );//copy link register
		const vec_uint4 cLR4							= spu_splats(spu_extract(curLR, 0));
		//load top entry of return stack (preload, will not use it if it is the job return or an inner page return
		const vec_ushort8 cTopEntry				= *(vec_ushort8*)g_pReturnStackTop;
		const uint16 cPageID							= spu_extract(cTopEntry, 1);
		const uint32 cRequPageSlot				= (uint32)spu_extract(cTopEntry, 2);
		//get index of destination page slot
		const vec_int4 cDestPageID4			= spu_splats((int)cPageID);
		const vec_uint4 cDestPageMask			= spu_cmpeq(cDestPageID4, g_SPUPageDir);//mask for dest.page
		const uint32 cDestPageSlot					= spu_extract(spu_add(spu_cntlz(spu_gather(cDestPageMask)), -28), 0);
		const vec_uint4 cLowerAddrCmpRes	= spu_cmpgt(cLR4, g_PageMemLower);
		const vec_uint4 cUpperAddrCmpRes	= spu_cmpgt(g_PageMemUpper, cLR4);
		const vec_uint4 cCurPageMask				= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes);
		const uint32 cCurPageID						= spu_extract(spu_orx(spu_and(cCurPageMask, g_SPUPageDir)), 0);
		const vec_uint4 cCurPageCmp				= spu_cmpeq(g_CurPageIndex, (vec_uint4)cCurPageID);//only lowest slot matter
#if defined(DO_SPU_PROFILING)
		++NSPU::NDriver::g_PerfStats.returnMissHandlerCalls;
#endif
		const bool cSimpleFastRet					= 
			(cPageID == SReturnStackEntry::cIsJobPage) ||	//return into job
			(cDestPageSlot == cRequPageSlot) ||						//page is present at its slot
			(spu_extract(cCurPageCmp, 0) != 0);					//inner page return
		g_pReturnStackTop									= (SReturnStackEntry*)(void*)spu_extract(spu_sel((vec_int4)((int32)g_pReturnStackTop -16), (vec_int4)(int32)g_pReturnStackTop, cCurPageCmp), 0);
		g_PageLRUCounter									= spu_add(g_PageLRUCounter, 1);//IncrPageLRU()
		g_CurPageIndex										= spu_sel((vec_uint4)spu_extract(cDestPageID4, 0),g_CurPageIndex,cCurPageCmp);
		IF(cSimpleFastRet, true)
		{
			//update LRU
			g_SPUPageLRUDir = spu_sel(g_SPUPageLRUDir, g_PageLRUCounter, cDestPageMask);
			asm volatile("bi %0" : "=r"(curLR) : : );
		}
		//replace page at desired slot
		SPageState& rState	= g_SPUPageStates[cRequPageSlot];
		const uint32 cTagMask	= (1<<(g_scDMAPageTag0 + cRequPageSlot));
		//start fenced transfer (fenced because we might need to replace a non synced transfer
		const uint32 cPageEA		= g_GlobalSPUPageDir[cPageID].ea;
		const uint32 cPageSize	= g_GlobalSPUPageDir[cPageID].size;
		const uint32 cPageDestLS	= (uint32)(g_SPUPageMem + g_SPUPageSize * cRequPageSlot);
		MemcpyLargeLSFenced(cPageDestLS, cPageEA, cPageSize, g_scDMAPageTag0+cRequPageSlot);
#if defined(DO_SPU_PROFILING)
		NSPU::NDriver::g_PerfStats.pageMemTransferred += cPageSize;
		++NSPU::NDriver::g_PerfStats.pagesTransferred;
		++NSPU::NDriver::g_PerfStats.pageMissesRetMissHandler;
#endif
		*(vec_uint4*)&rState = spu_splats((uint32)0);//reset rState
		rState.curState		= PAGE_STATE_READY;
		rState.curIndex		=	cPageID;
		//update LRU
		g_SPUPageLRUDir = spu_insert(spu_extract(g_PageLRUCounter, 0), g_SPUPageLRUDir, cRequPageSlot);
		g_SPUPageDir		= spu_insert(cPageID, g_SPUPageDir, cRequPageSlot);
		spu_writech(MFC_WrTagMask, cTagMask);//sync all at once
		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
		spu_readch(MFC_RdTagStat);
		asm volatile("bi %0" : "=r"(curLR) : : );
	}
*/

#if defined(CHECK_PAGE_HAZARD)	
.BS1:
	.string	"CodePagingReturnMissHandler: stack pointer underflow"	
.BS2:
	.string	"CodePagingReturnMissHandler: page size < 128 byte"	
.BS4:
	.string	"CodePagingReturnMissHandler: current page slot == repl.slot"	
#endif	
.text
	.align 6
	.global	CodePagingReturnMissHandler
	.type	CodePagingReturnMissHandler, @function
CodePagingReturnMissHandler:
	ila $65, 66051					//0/2		load mask for shuffling ints into all 4 slots
	lqa	$62, _ZN4NSPU17g_pReturnStackTopE			//1/6		load g_pReturnStackTop
	ila	$56, 65535					//0/2		load mask for shorts
	lqa	$64, _ZN4NSPU14g_PageMemLowerE			//1/6		load g_PageMemLower
	il	$19, 16383					//0/2		gen 16 *1024 - 1
	lqa	$63, _ZN4NSPU14g_PageMemUpperE			//1/6		load g_PageMemUpper
	il	$32, 1							//0/2		gen 1											//0/0	
	shufb	$59, $lr, $lr, $65//1/4		const vec_uint4 cLR4 = spu_splats(spu_extract(curLR, 0))
	andi $33, $33, 0				//0/2		nullify rState
	lqa	$60, _ZN4NSPU12g_SPUPageDirE				//1/6		load g_SPUPageDir
	il	$7,66								//0/2		gen MFC_CMD_WORD(0, 0, MFC_GETF_CMD)
	lqa	$39,_ZN4NSPU16g_PageLRUCounterE			//1/6		load g_PageLRUCounter
	ai	$55, $62, -16				//0/2		(vec_int4)((int32)g_pReturnStackTop -16)
	lqd $58, 0($62)					//1/6		const SReturnStackEntry cTopEntry = *g_pReturnStackTop
	clgt $48, $59, $64			//0/2		const vec_uint4 cLowerAddrCmpRes = spu_cmpgt(cLR4, g_PageMemLower)	
	lqa	$37, _ZN4NSPU15g_SPUPageLRUDirE			//1/6		load g_SPUPageLRUDir
	clgt $47, $63, $59			//0/2		const vec_uint4 cUpperAddrCmpRes = spu_cmpgt(g_PageMemUpper, cLR4)
	lqa	$43,_ZN4NSPU14g_CurPageIndexE					//1/6		load g_CurPageIndex
	ila	$35, _ZN4NSPU15g_SPUPageStatesE			//0/2		&g_SPUPageStates[0]
#if defined(SUPP_SN)
	lqa $54, _ZN4NSPU7NDriver13g_sDebugStateE	//1/6  load g_sDebugState
#else	
	lnop										//1/0
#endif	
#if defined(CHECK_PAGE_HAZARD)		
	ila $31, _ZN4NSPU13g_ReturnStackE	//0/2		load &g_ReturnStack[0]
	lnop										//1/0
	ila $30, _ZN4NSPU11NCodePaging10PageHazardEPKc	//0/2		load &PageHazard
	lnop										//1/0
#endif	
	and	$46, $47, $48				//0/2		const vec_uint4 cCurPageMask	= spu_and(cLowerAddrCmpRes, cUpperAddrCmpRes)
	lqa	$59,_ZN4NSPU18g_GlobalSPUPageDirE		//1/6		load g_GlobalSPUPageDir
	ai	$38, $39, 1					//0/2		++g_PageLRUCounter
	lqa $22, _ZN4NSPU13g_SPUPageSizeE				//1/6		load g_SPUPageSize
	and	$53, $58, $56				//0/2		const uint16 cPageID = spu_extract(cTopEntry, 1)
	rotqbyi	$52, $58, 2			//1/4		spu_extract(cTopEntry, 2)
	and	$45, $46, $60				//0/2		spu_and(cCurPageMask, g_SPUPageDir)
	lqa $21, _ZN4NSPU12g_SPUPageMemE				//1/6		load g_SPUPageMem
	ceq	$42, $53, $56				//0/2		(cPageID == SReturnStackEntry::cIsJobPage)
	shufb	$51, $53, $53, $65//1/4		const vec_int4 cDestPageID4	= spu_splats((int)cPageID)
	shli	$28, $53, 3				//0/4		sizeof(SPageInfo) * cPageID
	orx	$45, $45						//1/4		const uint32 cCurPageID = spu_extract(spu_orx(spu_and(cCurPageMask, g_SPUPageIndexMask0)), 0)
	and	$52, $52, $56				//0/2		const uint32 cRequPageSlot = (uint32)spu_extract(cTopEntry, 2)
#if defined(DO_SPU_PROFILING)
	lqa	$29, _ZN4NSPU7NDriver11g_PerfStatsE+80//1/6	load ((vec_uint4*)&NSPU::NDriver::g_PerfStats)[5]
#else
	lnop										//1/0
#endif	
	NOP											//0/0
	stqa	$38,_ZN4NSPU16g_PageLRUCounterE			//1/6		store ++g_PageLRUCounter
	ceq	$50, $51,$60				//0/2		const vec_uint4 cDestPageMask = spu_cmpeq(cDestPageID4,g_SPUPageDir)
	lnop										//1/0
	ceq	$44, $45,$43 				//0/2		const vec_uint4 cCurPageCmp = spu_cmpeq(g_CurPageIndex, (vec_uint4)cCurPageID)
	lnop										//1/0
	selb $36, $37, $38, $50	//0/2		g_SPUPageLRUDir = spu_sel(g_SPUPageLRUDir, g_PageLRUCounter, cDestPageMask)
	gb	$49, $50						//1/4		spu_gather(cDestPageMask)
	or	$42, $42, $44				//0/2		(cPageID == SReturnStackEntry::cIsJobPage) || (spu_extract(cCurPageCmp, 0) != 0)
	hbr .SimpleFastRet, $lr	//1/10  branch hint for expected simple return destination (link register cont.)
	selb $40, $55, $62, $44 //0/2	  g_pReturnStackTop = (SReturnStackEntry*)(void*)spu_extract(spu_sel((vec_int4)((int32)g_pReturnStackTop -16), (vec_int4)(int32)g_pReturnStackTop, cCurPageCmp), 0)
	stqa $36, _ZN4NSPU15g_SPUPageLRUDirE			//1/6		store updated g_SPUPageLRUDir (overwritten if !cSimpleFastRet)
	selb $51, $51, $43, $44	//0/2		g_CurPageIndex = spu_sel(cDestPageID4,g_CurPageIndex,cCurPageCmp)
	lqx	$27, $59, $28				//1/6		load g_GlobalSPUPageDir[cPageID]
	clz	$49, $49						//0/2		spu_cntlz(spu_gather(cDestPageMask))
	stqa $40, _ZN4NSPU17g_pReturnStackTopE			//1/6		store updated g_pReturnStackTop
	shli $15, $52, 2				//0/4		cRequPageSlot * 4 for insertion mask for g_SPUPageLRUDir
#if defined(DO_SPU_PROFILING)	
	rotqbyi $39, $29, 0			//1/4		copy orig. contents of NSPU::NDriver::g_PerfStats
#else	
	lnop										//1/0
#endif	
	ai	$49, $49, -28				//0/2		const uint32 cDestPageSlot	= spu_extract(spu_add(spu_cntlz(spu_gather(cDestPageMask)), -28), 0)
	stqd	$sp,-80($sp)			//1/6		store stack pointer for cellDmaLargeCmd
	andi $26, $28, 8				//0/2		gen rotation number to access g_GlobalSPUPageDir[cPageID].ea
	stqd $lr, -32($sp)			//1/6		store link register for cellDmaLargeCmd
	ceq	$41, $49, $52				//0/2		(cDestPageSlot == cRequPageSlot)
	stqa $51,	_ZN4NSPU14g_CurPageIndexE					//1/6		store g_CurPageIndex = (vec_uint4)spu_extract(cDestPageID4, 0)
	ai	$25, $26, 4					//0/2		gen rotation number to access g_GlobalSPUPageDir[cPageID].size
	rotqby	$62, $27, $26		//1/4		const uint32 cPageEA	= g_GlobalSPUPageDir[cPageID].ea
	or	$42, $42, $41				//0/2		const bool cSimpleFastRet = (complete now)
#if defined(CHECK_PAGE_HAZARD)
	NOP											//0/0
	brnz $42, .SkipPageHazardChecks		//1/4   perform hazard check not in case of job return
	ai $31, $31, -1					//0/2		subtract 1 to compare for greater than
	rotqbyi	$16, $3, 0			//1/4		save $3	
	cgt $31, $40, $31				//0/2		(g_pReturnStackTop >= &g_ReturnStack[0])
	lnop										//1/0
	ila $3, .BS1						//0/2		put string into parameter register for PageHazard()
	biz $31, $30						//1/4		if((uint32)g_pReturnStackTop < &g_ReturnStack[0]) PageHazard()
	#now check if replacement slot equals current slot
	ila	$31,66051						//0/2   load mask for word shuffle(splats)
	shufb	$31,$43,$43,$31		//1/4		spu_splats(spu_extract(g_CurPageIndex, 0))
	ceq $31, $60, $31				//0/2   find out current slot (mask, gather, clz)
	gb $31, $31							//1/4
	clz $31, $31						//0/2
	ai $31, $31, -28				//0/2
	ceq $31, $52, $31				//0/2		(current page slot == replSlot)?
	ila $3, .BS4						//0/2		put string into parameter register for PageHazard()
	binz $31, $30						//1/4		if((current page slot == replSlot)) PageHazard()
	ori	$3, $16, 0					//0/2		restore $3
.SkipPageHazardChecks:	
#endif
	cwx	$14, $sp, $15				//1/4		complete insertion mask for g_SPUPageLRUDir
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$16, 4($sp)					//1/4		create insertion mask for returnMissHandlerCalls
	NOP											//0/0
	rotqbyi	$2, $29, 2			//1/4		rotate NSPU::NDriver::g_PerfStats.returnMissHandlerCalls into pref.slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.returnMissHandlerCalls
	shufb	$29, $2, $29, $16	//1/4		reinsert returnMissHandlerCalls into the proper pos
	ori $39, $29, 0					//0/2		(STALL 3) copy new original contents	
	stqa	$29, _ZN4NSPU7NDriver11g_PerfStatsE+80		//1/6 store updated g_PerfStats
#endif	
	mpyh $11, $22, $52			//0/7		g_SPUPageSize * cRequPageSlot
	rotqby	$17, $27, $25		//1/4		const uint32 cPageSize	= g_GlobalSPUPageDir[cPageID].size
	mpyu $20, $22, $52			//0/7		g_SPUPageSize * cRequPageSlot
	brz $42, .ReplacePages  //1/4		if(!cSimpleFastRet) replace page
#if defined(SUPP_SN)			//modelled so that is does not branch if no debugging is enabled and cSimpleFastRet is true	
	brnz	$54, .SPU_DEBUG_BREAK_RET	//1/4		if(IsDebugEnabled()) snPause()
#endif	
.SimpleFastRet:
#if defined(CHECK_PAGE_HAZARD)
	NOP											//0/0
	brz $42,.ReplacePages   //1/4	  skip test if(!cSimpleFastRet)
  lqd $18, 0($lr)					//1/6		load jump destination to check for non 0
	brnz $18,.JumpTestPassed3//1/4
	NOP											//0/0
  stop 255								//1/4		stop debugger if dest.,is zero
.JumpTestPassed3:  
#endif
	binz $42, $lr						//1/4		if(cSimpleFastRet) branch to link register cont.

.ReplacePages:
	ai $18, $52, 8					//0/2		g_scDMAPageTag0 + cRequPageSlot	
	shli $34, $52, 4				//0/4		sizeof(SPageState) * cRequPageSlot	
	selb $60,$60,$50,$50		//0/2		where dest.page matches, replace by -1 to avoid multiple page matches next miss
	lnop										//1/0
	il $63, 16384					  //0/2		load 16*1024
	shufb	$13, $38, $37, $14//1/4		g_SPUPageLRUDir = spu_insert(spu_extract(g_PageLRUCounter, 0), g_SPUPageLRUDir, cRequPageSlot)
	shl	$65,$32,$18					//0/4		const uint32 cTagMask	= (1<<(g_scDMAPageTag0 + cRequPageSlot))
	rotqbyi	$16, $3, 0			//1/4		save $3		
	a $20, $11, $20					//0/2   finalize g_SPUPageSize * cRequPageSlot
	shufb	$12, $53, $60, $14//1/4		g_SPUPageDir = spu_insert(cPageID, g_SPUPageDir, cRequPageSlot)
#if defined(CHECK_PAGE_HAZARD)	
	clgti $47, $17, 127			//0/2		(cPageSize > 127)
	lnop										//1/0
#endif	
	ori $60, $17, 0					//0/2		int sizeLeft = (int)cSize
	lnop										//1/0	
	a $3, $20, $21					//0/2		const uint32 cPageDestLS = (uint32)(g_SPUPageMem + g_SPUPageSize * cRequPageSlot)
	stqa	$13, _ZN4NSPU15g_SPUPageLRUDirE			//1/6		store updated g_SPUPageLRUDir
	a	$11, $35, $34					//0/2		&g_SPUPageStates[cRequPageSlot]
	lnop										//1/0	
	ori $61, $3, 0					//0/2   uint32 curDest = (uint32)dest
	lnop										//1/0	
#if defined(CHECK_PAGE_HAZARD)		
	ila $3, .BS2						//0/2		put string into parameter register for PageHazard()
	biz $47, $30						//1/4		if(cPageSize <= 127) PageHazard()
#endif
	NOP											//0/0	
	chd	$31, 2($11)					//1/4		generate controls for updating rState.curState	
	cgt $8,$60,$63					//0/2		(sizeLeft>16*1024)?	(for first iteration)
#if defined(DO_SPU_PROFILING)
	cwd	$22,0($sp)					//1/4		gen insertion mask for g_PerfStats.pageMemTransferred
	a	$29, $29, $17					//0/2		NSPU::NDriver::g_PerfStats.pageMemTransferred += cPageSize
	shufb	$29,$29,$39,$22		//1/4		(STALL 3) mask non affected bytes
	ori $39, $29, 0					//0/2		(STALL 3) copy new original contents
#endif
	stqa	$12, _ZN4NSPU12g_SPUPageDirE				//1/6		store updated g_SPUPageDir
//MemcpyLS
	ori	$3, $16, 0					//0/2		restore $3	
	wrch	$ch20, $18				//1/6		si_wrch(MFC_TagID,si_from_uint(g_scDMAPageTag0+cRequPageSlot))
.MemcpyLargeLSLoop:	
	selb $8,$60,$63,$8			//0/2		(sizeLeft>16*1024)?16*1024 : sizeLeft)
	wrch $ch18, $62					//1/6		si_wrch(MFC_EAL,si_from_uint(cPageEA))
	sf	$60,$63,$60					//0/2		sizeLeft	-= 16*1024
	lnop										//0/0
	a $62, $62, $63					//0/2		curSource	+= 16*1024
	wrch $ch16, $61					//1/6		si_wrch(MFC_LSA, (int)curDest)
	cgti $9, $60, 0					//0/2		(sizeLeft > 0)?
	wrch	$ch19, $8					//1/6		si_wrch(MFC_Size,si_from_uint(cPageSize))
	cgt $8,$60,$63					//0/2		(sizeLeft>16*1024)?
	wrch	$ch21, $7					//1/6		si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD))
	a $61, $61, $63					//0/2		curDest	+= 16*1024
	brnz $9, .MemcpyLargeLSLoop		//1/4		while(sizeLeft > 0)
.AfterMemcpy:	
	il $16, 2								//0/2		gen MFC_TAG_UPDATE_ALL
	chd	$55, 0($11)					//1/4		generate controls for updating rState.curIndex	
	NOP											//0/0
	shufb	$33,$16,$33,$31		//1/4		rState.curState		= PAGE_STATE_READY	
//	NOP											//0/0		
	shufb	$33, $53, $33, $55//1/4		(STALL 3) rState.curIndex	=	cPageID	
//	NOP											//0/0
	stqd $33, 0($11)				//1/6		(STALL 3) store g_SPUPageStates[cRequPageSlot]	
	NOP											//0/0		
	wrch $ch22,$65					//1/6		spu_writech(MFC_WrTagMask, cTagMask)
	NOP											//0/0
	wrch $ch23,$16					//1/6		spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
#if defined(DO_SPU_PROFILING)
	NOP											//0/0
	chd	$16, 12($sp)				//1/4		create insertion mask for pageMissesRetMissHandler
	NOP											//0/0
	rotqbyi	$2, $29, 10			//1/4		rotate NSPU::NDriver::g_PerfStats.pageMissesRetMissHandler into pref.slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.pageMissesRetMissHandler
	shufb	$29, $2, $29, $16	//1/4		reinsert pageMissesRetMissHandler into the proper pos
	NOP											//0/0
	chd	$16, 14($sp)				//1/4		create insertion mask for pagesTransferred
	NOP											//0/0
	rotqbyi	$2, $29, 12			//1/4		rotate NSPU::NDriver::g_PerfStats.pagesTransferred into pref.slot
	ahi	$2, $2, 1						//0/2		++NSPU::NDriver::g_PerfStats.pagesTransferred
	shufb	$29, $2, $29, $16	//1/4		reinsert pagesTransferred into the proper pos
#endif	
//	NOP											//0/0	
	rdch $10,$ch24					//1/4		spu_readch(MFC_RdTagStat)
//	NOP											//0/0	
#if defined(DO_SPU_PROFILING)	
	stqa	$29, _ZN4NSPU7NDriver11g_PerfStatsE+80		//1/6 store updated g_PerfStats
	NOP											//0/0	
#endif
#if defined(CHECK_PAGE_HAZARD)
  lqd $29, 0($lr)					//1/6		load jump destination to check for non 0
	brnz $29,.JumpTestPassed0 //1/4
	NOP											//0/0
  stop 255								//1/4		stop debugger if dest.,is zero
.JumpTestPassed0:  
#endif
#if defined(SUPP_SN)
	brnz	$54, .SPU_DEBUG_BREAK_RET	//1/4		if(IsDebugEnabled()) snPause()
	NOP											//0/0		
#endif	
.RetAfterMemcpy:
	bi $lr									//1/4		branch to link register cont.
#if defined(SUPP_SN)
.SPU_DEBUG_BREAK_RET:
	stqa $lr,261616					//1/6
	STOPD										//1/4		custom snPause()
	NOP											//0/0
	bi	$lr									//1/4		branch to link register cont.
#endif

	.size	CodePagingReturnMissHandler, .-CodePagingReturnMissHandler


		//derived from the compiler generated asm code of SetActivePages, to be called from inside CodePagingCallMissHandler
		//besides the parameter/scratch registers 2..6 it only uses registers 29..39,46,47,61,64..69 and the result registers listed below
		//$52 contains the ls address of the last written page (in case of just one page, this is the dest. address)
		//$48 contains SPageState& rState = g_SPUPageStates[cDestPageSlot] of the last written page
		//$53 contains the dma tag of the last written page
		//$39 contains replSlot * sizeof(SPageState) of the last written page
/*	C++ version:

		//sets the IDs of the current 1..4 active pages (1 set at least (current))
		//	the first index marks the current page, any following -1 means no page
		//	returns true if a dma transfer has been started
		const bool SetActivePages(const vec_uchar16 cIDv0, const vec_uchar16 cIDv1, const vec_uchar16 cIDv2, const vec_uchar16 cIDv3)
		{
			#define INVALID_PAGEID 65535
			const int cID0 = (int)spu_extract(spu_gather(cIDv0),0);
			const int cID1 = (int)spu_extract(spu_gather(cIDv1),0);
			const int cID2 = (int)spu_extract(spu_gather(cIDv2),0);
			const int cID3 = (int)spu_extract(spu_gather(cIDv3),0);
			const uint32 cSPUPageSize = g_SPUPageSize;
			const SPageInfo *const __restrict rGlobalSPUPageDir = g_GlobalSPUPageDir;
			uint8* const __restrict pSPUPageMem = g_SPUPageMem;
			SPageState*const __restrict pSPUPageStates = g_SPUPageStates;
#if defined(CHECK_PAGE_HAZARD)
			if(cID0 == INVALID_PAGEID)
				PageHazard("SetActivePages: current ID has ID < 0");
			if((cID2 != INVALID_PAGEID || cID3 != INVALID_PAGEID ) && cSPUPageSize == 64 * 1024)
				PageHazard("Wrong usage for SetActivePages in overlay mode(2x64 KB)\n");
#endif
			bool dmaStarted = false;
			//check if ID is present and if not, check if an empty page slot is available 
			//if not, replace one page slot and start streaming of it
			const uint32 cDecrVal = spu_readch(SPU_RdDec);//need to track streaming start
			const int cIDs[scMaxSPUPageCount] = {cID1, cID2, cID3, INVALID_PAGEID};
			vec_int4 curSPUPageDir = g_SPUPageDir;
			vec_uint4 curSPUPageLRUDir = g_SPUPageLRUDir;
			vec_uint4 curPageLRUCounter = g_PageLRUCounter;
			curPageLRUCounter = spu_add(curPageLRUCounter, 1);//IncrPageLRU()
			int i=0;
			int curID = cID0;
			do
			{
				const vec_int4 cSplatID = spu_splats(curID);
				IF(spu_extract(spu_orx(spu_cmpeq(curSPUPageDir, cSplatID)), 0) == 0, 1)//IF(!IsPagePresent(cSplatID),1)
				{
					uint8 replSlot = spu_extract(spu_add(spu_cntlz(spu_gather(spu_cmpeq(curSPUPageDir, spu_splats(-1)))), -28), 0);
					//get replacement page, will not be current page since LRU is highest (just updated)
					IF(replSlot == NO_EMPTY_PAGE_SLOT, 1)
						replSlot = GetReplIndex(curSPUPageLRUDir);
#if defined(CHECK_PAGE_HAZARD)
					if(cSPUPageSize == 64 * 1024 && replSlot > 1)
						PageHazard("Got reserved replacement slot for SetActivePages in overlay mode(2x64 KB)\n");
#endif
					//stream new page
					const SPageInfo& crPageInfo = rGlobalSPUPageDir[curID];
					const uint32 cPageEA				= crPageInfo.ea;
					const uint32 cPageSize			= crPageInfo.size;
					const uint32 cPageDestLS		= (uint32)(pSPUPageMem + cSPUPageSize * replSlot);
					MemcpyLargeLS((TAddrLS)cPageDestLS, cPageEA, cPageSize, g_scDMAPageTag0+replSlot, true);
					dmaStarted = true;
#if defined(DO_SPU_PROFILING)
					NSPU::NDriver::g_PerfStats.pageMemTransferred += cPageSize;
					++NSPU::NDriver::g_PerfStats.pagesTransferred;
					++NSPU::NDriver::g_PerfStats.pageMisses;
#endif

					SPageState& __restrict rSPUPageStates		= pSPUPageStates[replSlot];
					rSPUPageStates.curState			= PAGE_STATE_STREAMING;
					rSPUPageStates.curIndex			= curID;
					rSPUPageStates.transDecrEnd	= cDecrVal - (cPageSize >> BYTES_PER_DECR_TICK_SHIFT);
					curSPUPageDir = spu_insert(curID, curSPUPageDir, replSlot);//SetPageIndex(curID, replSlot)
				}
				//increment LRU
				curSPUPageLRUDir = spu_sel(curSPUPageLRUDir, curPageLRUCounter, spu_cmpeq(cSplatID, curSPUPageDir));//UpdatePageLRUByID(cSplatID);
				curID		= cIDs[i++];
			}
			while(curID != INVALID_PAGEID);
			curPageLRUCounter = spu_add(curPageLRUCounter, 1);//IncrPageLRU()
			g_SPUPageDir			= curSPUPageDir;
			g_SPUPageLRUDir		= curSPUPageLRUDir;
			g_PageLRUCounter	= curPageLRUCounter;
			return dmaStarted;
		}
*/
#if defined(CHECK_PAGE_HAZARD)
.BS5:
	.string	"SetActivePages: repl. current page slot"
.BS6:
	.string	"SetActivePages: current page ID has ID < 0"
.BS7:
	.string	"Wrong usage for SetActivePages in overlay mode(2x64 KB)"
.BS8:
	.string	"Got reserved replacement slot for SetActivePages in overlay mode(2x64 KB)"
#endif	
//---------------------------------------------------------------------------------------------
// SetActivePages (asm version of the C++ reference in the comment above)
//	makes sure the pages cID0..cID3 are resident: every page ID that is not found in
//	g_SPUPageDir gets a slot (an empty one, otherwise the LRU slot) and its dma transfer is
//	started; the LRU entry of every processed page is updated
//
//	in:		$3..$6		cIDv0..cIDv3 (each ID is gathered via gbb into a 16 bit value, 65535 = no page)
//	out:	$3				dmaStarted (1 if a page transfer has been started, 0 otherwise)
//				$52				ls address of the last written page
//				$48				copy of the updated g_SPUPageStates[replSlot] quadword of the last written page
//				$53				dma tag (8 + replSlot) of the last written page
//				$39				replSlot * sizeof(SPageState) of the last written page
//	stack:	allocates a 48 byte frame, cIDs[] is kept at 32($sp)
//	loop registers:
//				$64 curID, $65 byte offset of the next cIDs[] entry, $66 splatted curID
//				$67 curSPUPageDir, $69 curSPUPageLRUDir, $34 curPageLRUCounter
//				$68 dmaStarted (temporarily reused as replSlot while a transfer is set up)
//				$30 cDecrVal, $31 word splat shuffle mask (reloaded at the end of each iteration)
//	behind each instruction: pipeline (0/1) / latency(cycles)
//---------------------------------------------------------------------------------------------
	.align	6
	.global	SetActivePages
	.type	SetActivePages, @function
SetActivePages:
	gbb	$66,$4					 //1/4		const int cID1 = (int)spu_extract(spu_gather(cIDv1),0)
	gbb	$38,$5					 //1/4		const int cID2 = (int)spu_extract(spu_gather(cIDv2),0)
	gbb	$39,$6					 //1/4		const int cID3 = (int)spu_extract(spu_gather(cIDv3),0)
	gbb	$64,$3					 //1/4		const int cID0 = (int)spu_extract(spu_gather(cIDv0),0)
#if defined(CHECK_PAGE_HAZARD)
	ori $29,$3,0				 //0/2		save $3
	ila $34, 65535			 //0/2    load INVALID_PAGEID
	ceq $37,$64,$34			 //0/2    cID0 == INVALID_PAGEID?
	lnop								 //1/0
	ila $30, _ZN4NSPU11NCodePaging10PageHazardEPKc	//0/2		load &PageHazard		
	lnop								 //1/0
	ila $3, .BS6				 //0/2		put string into parameter register for PageHazard()	
	binz $37, $30				 //1/4    if(cID0 == INVALID_PAGEID)PageHazard("SetActivePages: current page ID has ID < 0")
	NOP									 //0/0
	lqr	$67,_ZN4NSPU13g_SPUPageSizeE		//1/6		const uint32 cSPUPageSize = g_SPUPageSize
	ceq $37,$38,$34			 //0/2    cID2 == INVALID_PAGEID?	
	ceq $69,$39,$34			 //0/2    cID3 == INVALID_PAGEID?	
	ila $34, 65536			 //0/2    load 64*1024 (INVALID_PAGEID is no longer needed, $34 gets reloaded with g_PageLRUCounter below; takes the slot of a former NOP)
	and $69, $69,$37		 //0/2    0 if (cID2 != INVALID_PAGEID) || (cID3 != INVALID_PAGEID)
	ceq $37,$67,$34			 //0/2    (cSPUPageSize == 64 * 1024)?
	orc $69,$69,$37			 //0/2    is 0 if (cSPUPageSize == 64 * 1024) && ((cID2 != INVALID_PAGEID) || (cID3 != INVALID_PAGEID))
	ila $3, .BS7				 //0/2		put string into parameter register for PageHazard()	
	biz $69, $30				 //1/4    if((cID2 != INVALID_PAGEID || cID3 != INVALID_PAGEID ) && cSPUPageSize == 64 * 1024) PageHazard("Wrong usage for SetActivePages in overlay mode(2x64 KB)\n")
	ori	$3, $29, 0			 //0/2		restore $3	
#endif	
	fsmbi	$2,3           //1/4    cIDs template: word 3 = 65535 (INVALID_PAGEID terminator), words 0..2 get cID1..cID3 inserted below
	cwd	$4,0($sp)        //1/4    gen word mask for insertion of cID1
	cwd	$37,4($sp)       //1/4    gen word mask for insertion of cID2
	cwd	$3,8($sp)        //1/4    gen word mask for insertion of cID3
	lqr	$34,_ZN4NSPU16g_PageLRUCounterE		//1/6		vec_uint4 curPageLRUCounter = g_PageLRUCounter	
	shufb	$4,$66,$2,$4   //1/4    cIDs[0] = cID1	
	stqd	$sp,-48($sp)	 //1/6		store stack pointer (back chain) as expected by ABI		
	rdch	$30,$ch8			 //1/20	  const uint32 cDecrVal = spu_readch(SPU_RdDec)
	ai	$sp,$sp,-48			 //0/2		allocate 48 byte stack frame (stack grows downwards)		
	lqr	$67,_ZN4NSPU12g_SPUPageDirE    //1/6  load g_SPUPageDir	
	il	$65,0						 //0/2		byte offset of cIDs[i] (i = 0)
	shufb	$5,$38,$4,$37  //1/4    cIDs[1] = cID2	
	ai	$34,$34,1        //0/2    curPageLRUCounter = spu_add(curPageLRUCounter, 1)		
	lqr	$69,_ZN4NSPU15g_SPUPageLRUDirE //1/6  load g_SPUPageLRUDir
	il	$68,0            //0/2    dmaStarted = false	
	shufb	$6,$39,$5,$3   //1/4    cIDs[2] = cID3
	ila	$31,66051        //0/2    load mask for word shuffle(splats)
	stqd	$6,32($sp)     //1/6    store cIDs
.IDLoop:
	ceqi	$47,$67,-1     //0/2    spu_cmpeq(curSPUPageDir, spu_splats(-1))
	rotqbyi	$33,$69,4    //1/4    const vec_uint4 cVal1	= spu_rlqwbyte(g_SPUPageLRUDir, 4)
	NOP									 //0/0
	shufb	$66,$64,$64,$31//1/4    const vec_int4 cSplatID = spu_splats(curID)
	il	$37,1            //0/2    gen 1 for GetReplIndex
	gb	$46,$47          //1/4    spu_gather(spu_cmpeq(curSPUPageDir, spu_splats(-1)))
	il	$29,2            //0/2    gen 2 for GetReplIndex
	rotqbyi	$38,$69,8    //1/4    const vec_uint4 cVal2	= spu_rlqwbyte(g_SPUPageLRUDir, 8)
	clgt	$36,$69,$33    //0/2    cCmpVec01 = spu_cmpgt(cVal0, cVal1)
	rotqbyi	$39,$69,12   //1/4    const vec_uint4 cVal3	= spu_rlqwbyte(g_SPUPageLRUDir, 12)
	ceq	$6,$67,$66       //0/2    spu_cmpeq(curSPUPageDir, cSplatID)	
	clz	$2,$46           //0/2    spu_cntlz(spu_gather(spu_cmpeq(curSPUPageDir, spu_splats(-1)))) 
	clgt	$31,$38,$39    //0/2    cCmpVec23 = spu_cmpgt(cVal2, cVal3) ($31 is reloaded with the splat mask at the end of the iteration)
	orx	$4,$6            //1/4    spu_orx(spu_cmpeq(curSPUPageDir, cSplatID))
	ai	$2,$2,-28        //0/2    spu_add(spu_cntlz(spu_gather(spu_cmpeq(curSPUPageDir, spu_splats(-1)))), -28)
	lqr	$35,_ZN4NSPU13g_SPUPageSizeE		//1/6		const uint32 cSPUPageSize = g_SPUPageSize	
	il	$32,3            //0/2    gen 3 for GetReplIndex
	ceqbi	$46,$2,4       //0/2    (replSlot == NO_EMPTY_PAGE_SLOT)? kept in $46 (dead after the clz above) so dmaStarted in $68 survives iterations whose page is already present
	selb	$33,$69,$33,$36//0/2    const vec_uint4 cCmpSelRes01			= spu_sel(cVal0, cVal1, cCmpVec01)
	brnz	$4,.PostPageNotPresent   //1/10  if(!IsPagePresent(cSplatID))
	selb	$38,$38,$39,$31//0/2    const vec_uint4 cCmpSelRes23			= spu_sel(cVal2, cVal3, cCmpVec23)
	xsbh	$46,$46        //0/2    sign extend (replSlot == NO_EMPTY_PAGE_SLOT)?
	selb	$39,$4,$37,$36 //0/2    const vec_uint4 cCmpIndexRes01		= spu_sel(0, 1, cCmpVec01), $4 is 0 here since the brnz above fell through
	lqr	$36,_ZN4NSPU18g_GlobalSPUPageDirE		//1/6		const SPageInfo *const __restrict rGlobalSPUPageDir = g_GlobalSPUPageDir
	clgt	$38,$33,$38    //0/2    cCmpVec0123 = spu_cmpgt(cCmpSelRes01, cCmpSelRes23)
	brhz	$46,.PostNoEmptyPageSlot	//1/4		skip GetReplIndex unless (replSlot == NO_EMPTY_PAGE_SLOT)
	selb	$29,$29,$32,$31//0/2    const vec_uint4 cCmpIndexRes23		= spu_sel(spu_promote((unsigned int)2,0), spu_promote((unsigned int)3,0), cCmpVec23)
	selb	$2,$39,$29,$38 //0/2    const vec_uint4 cCmpIndexRes0123	= spu_sel(cCmpIndexRes01, cCmpIndexRes23, cCmpVec0123)
.PostNoEmptyPageSlot:
	shli	$39,$64,3      //0/4    calc. offset for &rGlobalSPUPageDir[curID]
	hbrr	.DoMemcpyLargeLSLoopWhileCond,.DoMemcpyLargeLSLoop	//branch hint for the dma loop branch
	il	$5,4             //0/2    offsetof(size, SPageInfo)
	andi	$68,$2,0x00ff  //0/2    finalize uint8 replSlot (byte masking), $68 becomes dmaStarted = true again once the transfer is issued
#if defined(CHECK_PAGE_HAZARD)
	lqa $37, _ZN4NSPU14g_CurPageIndexE	//1/6		load g_CurPageIndex
	ori $29,$3,0				 //0/2		save $3
	lqa	$4, _ZN4NSPU12g_SPUPageDirE			//1/6		load g_SPUPageDir	
	ila	$38,66051				 //0/2   load mask for word shuffle(splats)
	shufb	$38,$37,$37,$38//1/4		spu_splats(spu_extract(g_CurPageIndex, 0))
	ceq $38, $4, $38		 //0/2   find out current slot (mask, gather, clz)
	gb $38, $38					 //1/4
	clz $38, $38				 //0/2
	ai $38, $38, -28		 //0/2
	ceq $38,$68, $38		 //0/2		(current page slot == replSlot)?
	ila $37, _ZN4NSPU11NCodePaging10PageHazardEPKc	//0/2		load &PageHazard	
	lnop								 //1/0
	ila $3, .BS5				 //0/2		put string into parameter register for PageHazard()
	binz $38, $37				 //1/4		if((current page slot == replSlot)) PageHazard()
	ila $61, 65536			 //0/2    load 64*1024
	lqr	$38,_ZN4NSPU13g_SPUPageSizeE		//1/6		const uint32 cSPUPageSize = g_SPUPageSize
	cgti $4,$68,1				 //0/2    replSlot > 1?
	ceq $38, $38,$61		 //0/2    cSPUPageSize == 64*1024?
	and $38,$38,$4			 //0/2    (replSlot > 1) && (cSPUPageSize == 64*1024)?
	ila $3, .BS8				 //0/2		put string into parameter register for PageHazard()	
	NOP									 //0/0
	binz $38, $37				 //1/4		if(cSPUPageSize == 64 * 1024 && replSlot > 1) PageHazard("Got reserved replacement slot for SetActivePages in overlay mode(2x64 KB)\n")
	ori	$3, $29, 0			 //0/2		restore $3
	lnop								 //1/0	
#endif
	a	$37,$36,$39        //0/2    &rGlobalSPUPageDir[curID]
	lqx	$38,$36,$39      //1/6    const uint32 cPageEA = crPageInfo.ea
	mpyh	$61,$35,$68    //0/7    cSPUPageSize * replSlot(high word)
	lqr	$4,_ZN4NSPU12g_SPUPageMemE		//1/6		uint8* const __restrict pSPUPageMem = g_SPUPageMem
	mpyu	$39,$35,$68    //0/7    cSPUPageSize * replSlot(low word)
	lqx	$5,$37,$5        //1/6    const uint32 cPageSize = crPageInfo.size
	ai	$29,$37,4        //0/2    gen rotate amount for cPageSize
	ai	$53,$68,8        //0/2    g_scDMAPageTag0+replSlot
	rotqby	$6,$38,$37   //1/4    rotate cPageEA into pref.slot
	rotqby	$29,$5,$29   //1/4    rotate cPageSize into pref.slot
	a	$5,$61,$39         //0/2    add high and low word for (cSPUPageSize * replSlot)
	wrch	$ch20,$53      //1/4		si_wrch(MFC_TagID,si_from_uint(cTagID))	
	a	$5,$4,$5           //0/2    const uint32 cPageDestLS = (uint32)(pSPUPageMem + cSPUPageSize * replSlot)
	lnop								 //1/0
#if defined(CHECK_PAGE_HAZARD)
	//zero the part of the page slot behind the page data beforehand: LS_dest=$5, size=$35
	sf $4,$29,$35				//0/2			iterLeft = (size - cPageSize)
	a $52, $29,$5				//0/2			curLS =  cPageDestLS + cPageSize
	andi $38, $38, 0		//0/2			gen cZero
	rotmi $4, $4, -4		//0/2			iterLeft = (size - cPageSize)/16
.ZeroPageLoopStart:
	brz $4, .ZeroPageEnd//1/4			if(iterLeft == 0)break
	stqd $38, 0($52)		//1/6			*(vec_uint4*)curLS = cZero
	ai	$52, $52, 16		//0/2			curLS += 16
	NOP
	ai	$4, $4, -1			//0/2			--iterLeft
	br .ZeroPageLoopStart//1/4		continue
.ZeroPageEnd:
#endif	
	ori	$4,$29,0         //0/2    int sizeLeft = (int)cSize
	lnop								 //1/0
	ori $52,$5,0				 //0/2    store last written ls address (expected by CPMH)
	lnop								 //1/0
#if defined(DO_SPU_PROFILING)	
	lqr	$38,_ZN4NSPU7NDriver11g_PerfStatsE+80	//1/6   load quadword holding g_PerfStats.pageMemTransferred
	cwd	$2,0($sp)				 //1/4		gen insertion mask for g_PerfStats.pageMemTransferred
	chd	$37,14($sp)			 //1/4		gen insertion mask for g_PerfStats.pagesTransferred
	chd	$39,8($sp)			 //1/4		gen insertion mask for g_PerfStats.pageMisses
	a	$48,$38,$29				 //0/2    NSPU::NDriver::g_PerfStats.pageMemTransferred += cPageSize;
	shufb	$38,$48,$38,$2 //1/4		insert pageMemTransferred
	rotqbyi	$2,$38,12    //1/4    rotate g_PerfStats.pagesTransferred into pref.slot
	lnop							   //1/0
	ahi	$2,$2,1          //1/4		++NSPU::NDriver::g_PerfStats.pagesTransferred
	shufb	$37,$2,$38,$37 //1/4    insert pagesTransferred
	lnop							   //1/0	    
	rotqbyi	$38,$37,6    //1/4    rotate g_PerfStats.pageMisses into pref.slot
	ahi	$38,$38,1        //1/4    ++NSPU::NDriver::g_PerfStats.pageMisses
	shufb	$2,$38,$37,$39 //1/4    insert pageMisses
	NOP									 //0/0
	stqr	$2,_ZN4NSPU7NDriver11g_PerfStatsE+80 //1/4    store NSPU::NDriver::g_PerfStats
#endif	
.DoMemcpyLargeLSLoop: 
	il	$38,16384        //0/2    gen 16*1024
	wrch	$ch16,$5       //1/4    si_wrch(MFC_LSA,si_from_uint(curDest))
	il	$2,66            //0/2    MFC_GET_CMD		
	wrch	$ch18,$6       //1/4    si_wrch(MFC_EAL,si_from_uint(curSource))
	cgt	$37,$4,$38       //0/2    cond = (sizeLeft>16*1024)?
	a	$6,$6,$38          //0/2    curSource	+= 16*1024	
	selb	$37,$4,$38,$37 //0/2    cond?16*1024 : sizeLeft)
	sf	$4,$38,$4        //0/2    sizeLeft	-= 16*1024		
	cgti	$39,$4,0       //0/2    (sizeLeft > 0)	
	wrch	$ch19,$37      //1/4    si_wrch(MFC_Size,si_from_uint((sizeLeft>16*1024)?16*1024 : sizeLeft))
	a	$5,$5,$38          //0/2    curDest	+= 16*1024	
	wrch	$ch21,$2       //1/4    si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD))
	NOP									 //0/0
.DoMemcpyLargeLSLoopWhileCond:
	brnz	$39,.DoMemcpyLargeLSLoop //1/4   while(sizeLeft > 0)
	shli	$37,$68,4      //0/4   replSlot * sizeof(SPageState)
	hbrr	.ContIDLoop,.IDLoop    //1/10   branch hint for while(curID != INVALID_PAGEID)
	ila	$4,_ZN4NSPU15g_SPUPageStatesE  //1/4  load &g_SPUPageStates
	chd	$6,2($sp)        //1/4    generate insertion mask for rSPUPageStates.curState
	shli	$5,$68,2       //0/4    sizeof(uint32) * replSlot
	cwd	$38,8($sp)       //1/4    generate insertion mask for rSPUPageStates.transDecrEnd
	rotmi	$39,$29,-8     //0/4    (cPageSize >> BYTES_PER_DECR_TICK_SHIFT) 
	lqx	$2,$37,$4        //1/6    load pSPUPageStates[replSlot]	
	il	$3,1             //0/2    gen PAGE_STATE_STREAMING
	chx	$36,$37,$4       //1/4    gen shuffle mask for pSPUPageStates[replSlot]
	il	$68,1            //0/2    dmaStarted = true (replSlot in $68 is not needed anymore)	
	cwx	$5,$sp,$5        //1/4    gen insert mask for spu_insert(curID, curSPUPageDir, replSlot)
	sf	$61,$39,$30      //0/2    cDecrVal - (cPageSize >> BYTES_PER_DECR_TICK_SHIFT)
	shufb	$3,$3,$2,$6    //1/4    rSPUPageStates.curState	= PAGE_STATE_STREAMING
	shufb	$67,$64,$67,$5 //1/4    curSPUPageDir = spu_insert(curID, curSPUPageDir, replSlot)
	shufb	$39,$64,$3,$36 //1/4    rSPUPageStates.curIndex = curID
	ceq	$6,$67,$66       //0/2    spu_cmpeq(cSplatID, curSPUPageDir)
	shufb	$61,$61,$39,$38//1/4    update mem location of pSPUPageStates[replSlot] (update rSPUPageStates.transDecrEnd)
	ori $39, $37,0			 //0/2    store replSlot * sizeof(SPageState)
	ori $48,$61,0				 //0/2    move pSPUPageStates[replSlot] into $48 as expected by CodePageMissHandler
	NOP									 //0/0
	stqx	$61,$37,$4		 //1/6    store pSPUPageStates[replSlot]
.PostPageNotPresent:
	ai	$37,$65,32			 //0/2		gen address for &cIDs[i]
	ila	$38,65535				 //0/2		gen INVALID_PAGEID
	selb	$69,$69,$34,$6 //0/2	  curSPUPageLRUDir = spu_sel(curSPUPageLRUDir, curPageLRUCounter, spu_cmpeq(cSplatID, curSPUPageDir))
	lqx	$2,$37,$sp			 //1/6    load cIDs[i]
	ila	$31,66051        //0/2    load mask for word shuffle(splats)	
	rotqby	$64,$2,$65	 //1/4    rotate cIDs[i++] into preferred slot
	ai	$65,$65,4				 //0/2		&cIDs[i+1] (address op for i++)
	ceq	$3,$64,$38			 //0/2		curID == INVALID_PAGEID
	NOP
.ContIDLoop:
	brz	$3,.IDLoop			 //1/6		while(curID != INVALID_PAGEID)
	ai	$2,$34,1				 //0/2		curPageLRUCounter = spu_add(curPageLRUCounter, 1)
	stqr	$69,_ZN4NSPU15g_SPUPageLRUDirE		//1/6	g_SPUPageLRUDir = curSPUPageLRUDir
	andi	$3,$68,1			 //0/2    gen dmaStarted
	stqr	$67,_ZN4NSPU12g_SPUPageDirE				//1/6	g_SPUPageDir = curSPUPageDir
	ai	$sp,$sp,48			 //0/2		restore stack pointer as expected by ABI
	stqr	$2,_ZN4NSPU16g_PageLRUCounterE		//1/6		g_PageLRUCounter = curPageLRUCounter
	NOP									 //0/0
	bi	$lr							 //1/4		return dmaStarted
	
	.size	SetActivePages, .-SetActivePages
	
#endif //PS3
