/*
	implementation of spu driver and its functions
	mechanism: - main is a never-exiting loop; it grabs jobs by polling and locking the PPU job queue (1 queue shared by all SPUs)
			  - each job has a certain order of data to be transferred to LS
				- can have dependent jobs to call next, so a certain order is guaranteed without the need to join or wait on some sync mechanism
				- each job can have multiple input/output packets (memory locations must not depend on the previous jobs)
				- for all kinds of callbacks, check the job state variable first; it is synchronized against the output DMA
				- stack when entered main is 752 bytes below the top address -> temp storage
				- it copies the job info block to local store, processes it by dma in the input memory mapping, job code and data 
				- it starts the job by calling the Execute function (always the entry point) by passing the pointer to the parameter area 
	polling behavior:
				- SPU ID0 initially polls with a high rate, the others wait for an event
				- once the pull pointer has been locked, the polling rate is set to high and no further locks are issued
				- if an SPU sees a locked pull pointer not locked by itself, it resets to a slow polling rate
				- once a job has been fetched, the next SPU (SPU with ID 0 -> SPU with ID 1 and so on) is signaled to start polling
				- once an SPU has finished processing a job chain, the next SPU is signaled to enter wait state
				  (because it changes to polling state itself), the polling frequency is reset to high

	loops:- each job can have >= 1 packets which are basically consecutive function calls
					- cache is flushed after all have been finished
				- if a job runs via a producer / consumer queue, each packet flushes the cache
					- it cannot have the inner loop packets as above in this mode

	memory layout:

	The LS memory map is organized as follows:
	[TOP Address = 0]
		SPUDriver including Cache
		Page directory
		per Job:
			Info packet (SInfoBlock)
			job function code and data
			parameter area (input into PFNEXECUTE)		
			memory transferred via DMA list
		Stack (at 752 below top)
		384 bytes Initial Loader / 4 * 128 byte static areas for copying stuff here
	[BOTTOM Address = scLSSize]
	
*/ 

#if defined(PS3)
#if defined(__SPU__)

#define eCryModule eCryM_Launcher
#include <CryModuleDefs.h>
#include <platform.h>
#include <stdio.h>
#include <spu_intrinsics.h>
#include <sys/spu_thread.h>
#include "SPU.h"
#include <IJobManSPU.h>
#include "../PPU/PPU.h"
#include "Memory.h"
#include "SPUMemManager_spu.h"
//include header files for dependency generation (to force recompilation of all jobs)
#include <SPUJobBase.h>
#include "SPUUtilities.h"
#include "./Cache/CacheDefs_spu.h"
#include "CodePage/SPUPages.h"
#include "CodePage/SPUPageLayout.h"
#include "../PPU/ProdConsQueue.h"

#if defined(SPU_CACHE_MISS_USE_ASM)
	extern "C" void FlushCacheComplete(const bool cDoSync);
#endif
extern "C" bool SetActivePages(const vec_uchar16, const vec_uchar16, const vec_uchar16, const vec_uchar16);

//switch on and fix inter SPU events which obtain jobs
//#define ENABLE_INTER_SPU_EVENTS

//enable it if the lock line lost reserv.event is used to wait for an update on the push pointer 
#define LOCK_USES_LLAR_EVENT 

//#if defined(SUPP_SN)
	#include <libsn_spu.h>
//#endif

namespace NSPU
{
	//declarations of the software cache state the driver touches
	//everything in here is declared extern, the definitions are not located in this section of the file
	namespace NCache
	{
		extern void SyncAtomicDCache();

		//set by NDriver::SetupCache to the memory directly following the cache (g_pSPUCache + cache size)
		extern vec_uint4* __restrict g_pSPUShadowCache;
		//the following async/prefetch directories are reset by NDriver::ResetCacheControl
		extern vec_uint4 g_AsyncRangesDirFrom;
		extern vec_uint4 g_AsyncRangesDirTo;
		extern uint32 g_CurSPUAsyncTag _ALIGN(16);
		extern vec_uint4 g_SPUAsyncDir;
		extern vec_uint4 g_PrefetchLRUDir;
		extern vec_uint4 g_PrefetchDir;
		//number of cache sets, computed by NDriver::SetupCache (0 if the cache is bypassed)
		extern uint32 g_SPUNumSets _ALIGN(16);
		//zeroed by NDriver::InitCache
		extern uint32 g_CurAtomicEA _ALIGN(16);
#if defined(ENABLE_HAZARD_MODE)
		//zeroed by NDriver::InitCache
		extern uint32 g_AtomicEAToStart _ALIGN(16);
#endif
#if defined(DO_SPU_PROFILING)
		//profiling counters, initialized by NDriver::InitCache
		extern vec_uint4 g_SPUCacheHitIncr;
		extern vec_uint4 g_SPUCacheProfIDCounter[];
		extern vec_uint4 g_SPUCacheCurProfID;
#endif
	}
	//make the set count accessible without the NCache qualifier
	using NCache::g_SPUNumSets;

	//code paging state; the extern entries are defined elsewhere, the non extern ones are defined here
	extern SPageInfo *g_GlobalSPUPageDir _ALIGN(16);	//indexed by page index, provides ea and size of a page (see NDriver::SetupPages)
	extern SPageState g_SPUPageStates[scMaxSPUPageCount] _ALIGN(16);	//per page slot state (curIndex, curState, transDecrEnd)
	extern vec_int4 g_SPUPageDir;	//reset to -1 in all words by NDriver::SetupPages
	extern vec_uint4 g_SPUPageLRUDir;
	extern uint8* __restrict g_SPUPageMem;	//LS start of the page memory, slot 0 starts here
	extern vec_uint4 g_PageLRUCounter;
	extern uint32 g_SPUPageSize _ALIGN(16);	//size of one page slot, set by NDriver::GetPageMemSize
	extern vec_uint4 g_PageMemLower;	//lower LS address of each of the 4 page slots (set by NDriver::SetupPages)
	extern vec_uint4 g_PageMemUpper;	//g_PageMemLower + page size (set by NDriver::SetupPages)
	extern SReturnStackEntry g_ReturnStack[RETURN_STACK_MAX_ENTRIES] _ALIGN(16);	//initialized by NDriver::InitCache
	extern SReturnStackEntry *g_pReturnStackTop;
	vec_uint4 g_CurPageIndex;					//current page index, code page miss handler keeps track
	vec_uint4 g_SPUPageIndexMask0;		//mask to quickly get index of a page slot (0..3)
	vec_uint4 g_SPUPageIndexMaskShl4;	//0, 16, 32, 48: mask to get offset into SPageState
	extern SPageDirInfo g_PageInfo _ALIGN(16);
	vec_uint4 g_SPUJobResolveFunc;		//address of job global resolve func

//#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
	uint32 g_sProgramTopLS _ALIGN(16);//program top address, set by NDriver::SetupCache
//#endif 

	extern void TransferFuncProfStats();

	namespace NDriver 
	{
#if defined(SUPP_DABR)
		SDABR g_sDABR _ALIGN(16);
#endif
		//one cache line local buffer, also used by cache miss handler and callback sync
		//(Lock and HandleCallback run their getllar/putllc sequences on it)
		uint8 g_sLSBuffer[128] _ALIGN(128) _DATASEC;
		//second 128 byte line buffer, GetJobFromJobQueue loads the push position into it
		uint8 g_sUnlockBuffer[128] _ALIGN(128) _DATASEC;
		//static variables to save stack and register passing of arguments
		SInfoBlock g_sInfoBlock _ALIGN(128) _DATASEC; 
		CSPUMemMan g_sMemMan _ALIGN(128) _DATASEC;

		//staging copy of the advanced pull position, written back to g_sPullEA by GetJobFromJobQueue
		static SJobQueuePos g_sJobQueuePullBack;
		//EA of the queue entry the current g_sInfoBlock was fetched from
		uint32 g_sInfoPacketAddr _ALIGN(16);
		//job storage info, its LS address is published to main memory by ResetJobStorage
		SJobStorageInfo g_scJobStorageLS;
		//EA of the pull position (SJobQueuePos) in main memory
		static uint32 g_sPullEA _ALIGN(16);
		//local copy of the current pull pointer, refreshed by Lock
		static uint32 g_sCurPullPtr _ALIGN(16);
		//EA of the packet sync cache line used by HandleCallback
		static uint32 g_sSPUPacketSyncEA _ALIGN(16);
		//EA of the SPageDirInfo in main memory (see ResetJobStorage)
		static uint32 g_sPageEA _ALIGN(16);
		uint32 g_GcmContextLocalAddr;
		uint32 g_SPUId;
#if defined(SUPP_SN)
		//non zero if ReleaseDebugging has to notify the PPU
		uint32 g_sDebugState _ALIGN(16);
#endif
		//tells the PPU through the interrupt mailbox that debugging has been finished
		//does nothing if SUPP_SN is not defined or no debug state is flagged
		SPU_DRIVER_INLINE
		void ReleaseDebugging()
		{
#if defined(SUPP_SN)
			if(g_sDebugState != 0)
			{
				//debug callback port combined with the event callback port bits
				spu_writech
				(
					SPU_WrOutIntrMbox,
					NPPU::scDebugCallbackPort | (EVENT_CALLBACK_PORT << EVENT_PORT_SHIFT)
				);
			}
#endif
		}

		//word 0: last program EA
		//word 1: last page mode
		static vec_uint4 g_sLastProcessData;
		//word 0: 1 if this SPU has obtained the pull lock
		//word 1: current idle count
		static vec_uint4 g_sLockActive _ALIGN(16);
		#define FAST_POLLING_IDLE_LOOP_CNT 300		//idle loop count for SPU 0 and if lock is to be obtained for the 1st time
		//NOTE(review): both branches below currently define the same value
#if defined(ENABLE_INTER_SPU_EVENTS)
		#define SLOW_POLLING_IDLE_LOOP_CNT 4000		//idle loop count for SPU 1..4 and if lock has been obtained by another SPU
#else
		#define SLOW_POLLING_IDLE_LOOP_CNT 4000		//idle loop count for SPU 1..4 and if lock has been obtained by another SPU
#endif

		uint32 g_DestMemAreaEA _ALIGN(16);
		//word JOB_SPAWN_PUSH_WORD: push pointer if to be set at parent's exit (same as curAddr in PPU::SJobQueuePos)
		//word JOB_SPAWN_STATE_WORD: turns 1 if a job has been spawned (callback or external job state must not be called)
		vec_uint4 g_JobSpawnRegister;
		vec_uint4 g_sZero;		//need some 0 to transfer, reused outside too (set by InitCache)

#if !defined(_NO_SPU_ASSERT)
		uint32 g_FuncTableEntryCount;
#endif

#if defined(DO_SPU_PROFILING)
		SJobPerfStats g_PerfStats _ALIGN(16);
		uint32 g_DestProfAreaEA;
		uint32 g_JobFetchTime;
#endif
#if defined(SUPP_SPU_FRAME_STATS)
		uint32 g_DestStatsAreaEA;
		//base EA of the frame profile entries updated by TransferSPUJobFrameStats
		uint32 g_FrameProfileDataBase _ALIGN(16);
#endif
#if defined(SUPP_PRINTF)
		uint32 g_DestPrintfAreaEA _ALIGN(16);
#endif
		uint32 g_DestCustomCallbackAreaEA;

#ifdef DO_SPU_FUNCPROFILING
		NSPU::SFuncProfSPUTiming g_FuncProfData _ALIGN(16);
#endif

		//to avoid polling on the bus for the SPUs, signals are sent
		//only IDs 0..3 are sending, ID 4 does not do anything
		//word 0: status if to poll for a job or not
		#define STATE_EVENT_POLLING 0
		#define STATE_EVENT_WAIT 1
		//word 1: event status of next SPU
		#define STATE_NEXT_SPU_ACTIVATED 0
		#define STATE_NEXT_SPU_DEACTIVATED 1
		#define STATE_NEXT_SPU_NOT_AVAIL 2
		//word 2: available
		//word 3: ID of SPU to send a signal to
#if defined(ENABLE_INTER_SPU_EVENTS)
		static vec_uint4 g_sEventStatNextSPU _ALIGN(16);
		static uint32 g_SignalBuffer[4] _ALIGN(16);//signal buffer for mmio (SendSignal2 uses element 3)
#endif

		//returns the last program EA (word 0 of g_sLastProcessData)
		SPU_DRIVER_INLINE
		const uint32 GetLastProgramEA()
		{
			const uint32 cLastProgramEA = spu_extract(g_sLastProcessData, 0);
			return cLastProgramEA;
		}

		//stores cEA as last program EA (word 0 of g_sLastProcessData), the other words are kept
		SPU_DRIVER_INLINE
		void SetLastProgramEA(const uint32 cEA)
		{
			const vec_uint4 cUpdatedData = spu_insert(cEA, g_sLastProcessData, 0);
			g_sLastProcessData = cUpdatedData;
		}

		//resets the last process data: all words 0 except the page mode word (1) which becomes 0xFFFFFFFF
		SPU_DRIVER_INLINE
		void ResetLastProcessData()
		{
			const vec_uint4 cAllZero = spu_splats((uint32)0);
			g_sLastProcessData = spu_insert(0xFFFFFFFF, cAllZero, 1);
		}

		//returns the last page mode (word 1 of g_sLastProcessData)
		SPU_DRIVER_INLINE
		const NPPU::EPageMode GetLastPageMode()
		{
			const uint32 cRawPageMode = spu_extract(g_sLastProcessData, 1);
			return (NPPU::EPageMode)cRawPageMode;
		}

		//stores cMode as last page mode (word 1 of g_sLastProcessData), the other words are kept
		SPU_DRIVER_INLINE
		void SetLastPageMode(const NPPU::EPageMode cMode)
		{
			const uint32 cRawPageMode = (uint32)cMode;
			g_sLastProcessData = spu_insert(cRawPageMode, g_sLastProcessData, 1);
		}

		//clears all words of the job spawn register (push word and state word)
		SPU_DRIVER_INLINE
		void ResetJobSpawnReg()
		{
			const vec_uint4 cCleared = spu_splats((uint32)0);
			g_JobSpawnRegister = cCleared;
		}

		//returns true if the state word of the job spawn register is not 0
		SPU_DRIVER_INLINE
		const bool IsJobSpawned()
		{
			const uint32 cSpawnState = spu_extract(g_JobSpawnRegister, JOB_SPAWN_STATE_WORD);
			return cSpawnState != 0;
		}

		//zeroes the first 16 bytes of the job storage and publishes its LS address
		//to the jobStorageLS member of the SPageDirInfo located at g_sPageEA in main memory
		SPU_DRIVER_INLINE
		void ResetJobStorage()
		{
			//allow printf handling before first job
			vec_uint4 *const pStorageVec = (vec_uint4*)&g_scJobStorageLS;
			*pStorageVec = spu_splats((uint32)0);
			//aligned DMA source holding the LS address of the job storage
			uint32 storageAddrLS _ALIGN(16);
			storageAddrLS = (uint32)&g_scJobStorageLS;
			const uint32 cDestEA = NSPU::NDriver::g_sPageEA + offsetof(NSPU::SPageDirInfo,jobStorageLS);
			MemcpyMain(cDestEA, &storageAddrLS, 4, g_scDMAPageTag0);
			//wait for the transfer, the source lives on the stack
			SyncMemory(g_scDMAPageTag0);
		}

		//returns the push word of the job spawn register
		SPU_DRIVER_INLINE
		const uint32 RetrieveJobSpawnPush()
		{
			const uint32 cSpawnPush = spu_extract(g_JobSpawnRegister, JOB_SPAWN_PUSH_WORD);
			return cSpawnPush;
		}

		//sets the push word of the job spawn register to 0, the state word is kept
		SPU_DRIVER_INLINE
		void ResetJobSpawnPush()
		{
			const uint32 cNoPush = 0;
			g_JobSpawnRegister = spu_insert(cNoPush, g_JobSpawnRegister, JOB_SPAWN_PUSH_WORD);
		}

		//returns the LS address of the push word inside the job spawn register
		SPU_DRIVER_INLINE
		const uint32 GetJobSpawnPushLS()
		{
			const uint32 cRegisterLS	= (uint32)&g_JobSpawnRegister;
			const uint32 cWordOffset	= JOB_SPAWN_PUSH_WORD * sizeof(int);
			return cRegisterLS + cWordOffset;
		}

		//returns the LS address of the state word inside the job spawn register
		SPU_DRIVER_INLINE
		const uint32 GetJobSpawnStateLS()
		{
			const uint32 cRegisterLS	= (uint32)&g_JobSpawnRegister;
			const uint32 cWordOffset	= JOB_SPAWN_STATE_WORD * sizeof(int);
			return cRegisterLS + cWordOffset;
		}

		//if a job has been spawned and a push pointer is pending in the job spawn register:
		//	transfers the push pointer to the PPU job queue, releases the queue spin lock and clears the pending push
		//does nothing otherwise
		SPU_DRIVER_INLINE
		void HandleJobSpawn()
		{
			const uint32 cPendingPush = RetrieveJobSpawnPush();
			IF(!(IsJobSpawned() && cPendingPush != 0), true)
				return;
			//destination of the push pointer and of the spin lock, both relative to the PPU sync EA
			const uint32 cPushDestEA	= g_PageInfo.ppuSyncEA + offsetof(SJobQueuePos, curAddr) - offsetof(SJobQueuePos, lockObtained);
			const uint32 cLockDestEA	= g_PageInfo.ppuSyncEA + sizeof(NPPU::SQueueNodeSPU);
			//the 4 byte transfer requires the same offset within 16 bytes as the push word has in LS
			assert((cPushDestEA & 0xF) == (JOB_SPAWN_PUSH_WORD * sizeof(int)));
			//important: the push pointer is set last (so that all data are valid once the other SPUs fetch them)
			MemcpyMainFenced(cPushDestEA, (TAddrLS)GetJobSpawnPushLS(), sizeof(int), g_scJobInfoTag);
			//transfer 0 back to unlock
			MemcpyMainFenced(cLockDestEA, (volatile TAddrLS)&g_sZero, 4, g_scJobInfoTag);
			//sync in place since the next packet could spawn a new job
			SyncMemory(g_scJobInfoTag);
			//nothing pending anymore
			ResetJobSpawnPush();
		}

		//tries to lock the job queue pull position in main memory (g_sPullEA) using the mfc atomic getllar/putllc commands
		//cSPUID: ID of this SPU, cSPUID+1 is stored in lockObtained
		//returns false if lockObtained was already set, true once the conditional store succeeded
		//	(g_sCurPullPtr and word 0 of g_sLockActive are updated in that case)
		//inlined since it is just called once
		SPU_DRIVER_INLINE
		const bool Lock(const uint32 cSPUID)
		{
			volatile SJobQueuePos *pPullPos = (volatile SJobQueuePos*)g_sLSBuffer;
			mfc_prep(pPullPos, g_sPullEA);
			for(;;)
			{
				//fetch the current pull position and obtain the reservation
				mfc_getllar_again();
				mfc_read_atomic_status();
				IF(pPullPos->lockObtained != 0, false)
				{
					//another SPU has locked the queue, exit and do some idle loops
					return false;
				}
				//write SPU, this way PPU knows which SPU currently has the lock (for profiling/debugging)
				pPullPos->lockObtained = cSPUID+1;
				mfc_putllc_again();
				//a status of 0 ends the loop, anything else retries from the getllar
				if(mfc_read_atomic_status() == 0)
					break;
			}
			//update current pull pointer on PPU (only this SPU can now change it)
			g_sCurPullPtr = pPullPos->curAddr;
			//remember that this SPU has obtained the lock
			g_sLockActive = spu_insert((uint32)1, g_sLockActive, 0);
			return true;
		}

		//gets a job from the job queue
		//cSPUID: ID of this SPU, forwarded to Lock if the queue lock is not held yet
		//returns true if a job has been pulled: g_sInfoBlock holds the entry, g_sInfoPacketAddr its EA and
		//	the advanced pull position has been transferred back to g_sPullEA
		//returns false if the lock is held by another SPU or (only without LOCK_USES_LLAR_EVENT) if no job is available
		//with LOCK_USES_LLAR_EVENT the function waits on the lost reservation event of the push position instead
		SPU_DRIVER_INLINE
		const bool GetJobFromJobQueue(const uint32 cSPUID)
		{
			//the push position is located at a fixed distance in front of the pull position
			const uint32 cPushEA = g_sPullEA - NPPU::scJobInfoPushPullAddressDiff;
			volatile SJobQueuePos *pLS = (volatile SJobQueuePos*)g_sUnlockBuffer;
			//fetch job til we find a fetchable one, if pull pointer hits push pointer, break and idle a bit
			while(1)
			{
				//word 0 of g_sLockActive tells if this SPU already holds the queue lock
				if(spu_extract(g_sLockActive, 0) == 0)
				{
					if(!Lock(cSPUID))
					{
						//switch to slow polling since another SPU has obtained the lock, this way we occupy the bus least
						NSPU::NDriver::g_sLockActive = spu_insert(SLOW_POLLING_IDLE_LOOP_CNT, NSPU::NDriver::g_sLockActive, 1);
						return false;//lock was acquired by another SPU, exit and do some idle loops
					}
					//lock obtained: switch the idle count (word 1) to fast polling
					NSPU::NDriver::g_sLockActive = spu_insert(FAST_POLLING_IDLE_LOOP_CNT, NSPU::NDriver::g_sLockActive, 1);
				}
#if defined(LOCK_USES_LLAR_EVENT)
				//get a reservation for the push pointer and use the lost reserv.event to quickly react to a write there
				//since only this SPU has acquired the lock, the pull pointer does not need to be checked
				uint32 llEvent;
	#if !defined(FAST_UNSAFE_LL_ENABLE)
				spu_write_event_mask(0);//discard previous (or phantom) events, as needed
				IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
				{
					//read and acknowledge whatever event is still pending
					llEvent = spu_readch(MFC_RD_EVENT_STATUS);
					spu_writech(MFC_WR_EVENT_ACK, llEvent);
				}
	#endif//FAST_UNSAFE_LL_ENABLE
				spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
				mfc_prep((void*)pLS, cPushEA);//transfer push here
				while(1)
				{
					mfc_getllar_again();//transfer push here
					mfc_read_atomic_status();
					//push equals pull: nothing to fetch, wait for the event and reload the push position
					IF((unsigned int)*(volatile unsigned int*)(&pLS->curAddr) == g_sCurPullPtr, false)
					{
						//snoop on a write to push
						llEvent = spu_readch(MFC_RD_EVENT_STATUS);
						spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
	#if defined(DO_SPU_PROFILING)
						spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
	#endif
						continue;
					}
					break;
				}
#else	//LOCK_USES_LLAR_EVENT
				//get memory here (otherwise we have done it already through Lock)
				//check if there is a new job to be pulled from PPU queue (check push pointer)
				MemcpyLS((void*)pLS, cPushEA, 128, g_scJobInfoTag);
				SyncMemory(g_scJobInfoTag);
				//assume false to have it faster if there is a job, otherwise we are happy to occupy the bus less
				IF(pLS->curAddr == g_sCurPullPtr, false)
					return false;
	#if defined(DO_SPU_PROFILING)
				spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
	#endif
#endif//LOCK_USES_LLAR_EVENT
				//pop the job entry among its parameters
				MemcpyLS((TAddrLS)&g_sInfoBlock, g_sCurPullPtr, sizeof(SInfoBlock), g_scJobInfoBlockTag);
				g_sInfoPacketAddr = g_sCurPullPtr;
				//NOTE(review): status is not used in this function
				int status;
				//update pull pointer and write back unconditional
				//the next position is computed while the info block transfer is in flight
				uint32 addr = g_sCurPullPtr + NSPU::NDriver::scSizeOfSJobQueueEntry;
				//presumably wraps to baseAddr once addr equals topAddr; CondSelEq is defined elsewhere - confirm
				g_sJobQueuePullBack.curAddr		= CondSelEq(addr, pLS->topAddr, pLS->baseAddr, addr);
				g_sJobQueuePullBack.topAddr		= pLS->topAddr;
				g_sJobQueuePullBack.baseAddr	= pLS->baseAddr;
				//wait for the info block before inspecting it
				SyncMemory(g_scJobInfoBlockTag);
				IF(g_sInfoBlock.IsFetchable(), true)//if we have found an invalid entry (which is already in progress from last queue loop), fetch next one
				{
					g_sLockActive = spu_insert((uint32)0, g_sLockActive, 0);//reset locking info
					//do not unlock before we have actually found a valid job
					//NOTE(review): the 16 byte write back presumably also clears lockObtained (never set in g_sJobQueuePullBack here) - confirm
					MemcpyMain(g_sPullEA, (TAddrLS)&g_sJobQueuePullBack, 16, 0);
#ifdef LOCK_USES_LLAR_EVENT
					//disable the lost reservation event again and drain a pending one
					spu_write_event_mask(0);
	#if !defined(FAST_UNSAFE_LL_ENABLE)
					IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
					{
						llEvent = spu_readch(MFC_RD_EVENT_STATUS);
						spu_writech(MFC_WR_EVENT_ACK, llEvent);
					}
	#endif//FAST_UNSAFE_LL_ENABLE
#endif //LOCK_USES_LLAR_EVENT
					break;
				}
				//entry was not fetchable: advance the local pull pointer and try the next entry (lock is kept)
				//update pull pointer
				g_sCurPullPtr = g_sJobQueuePullBack.curAddr;
			}
			return true;
		}

#if defined(SUPP_SPU_FRAME_STATS)
		//atomically accumulates the frame statistics of the current job (entry g_sInfoBlock.frameProfIndex)
		//cTicks:				decrementer ticks, scaled by 1000/79800 before being added to word 0 of the entry
		//cAddJobInvoc:	additional invocations, word 1 of the entry is increased by 1 + cAddJobInvoc
		SPU_DRIVER_INLINE
		void TransferSPUJobFrameStats(const uint32 cTicks, const uint32 cAddJobInvoc)
		{
			//both increments do not depend on the fetched entry, compute them once outside the retry loop
			const float cInvTB = 1000.f / 79800.f;//inverse ticks per usec
			const uint32 cTimeIncr = (uint32)((float)cTicks * cInvTB);
			const uint32 cCallIncr = 1 + cAddJobInvoc;
			//entries are 16 bytes each, the atomic unit is the surrounding 128 byte line
			const uint32 cEntryEA = (g_FrameProfileDataBase + (g_sInfoBlock.frameProfIndex << 4)) & ~15;
			char lineBuf[128] _ALIGN(128);
			mfc_prep((void*)lineBuf, cEntryEA & ~127);
			vec_uint4 *const pEntry = (vec_uint4*)&lineBuf[cEntryEA & 127];
			int status;
			do
			{
				//fetch the line and obtain the reservation
				mfc_getllar_again();
				mfc_read_atomic_status();
				vec_uint4 entry = *pEntry;
				entry = spu_insert(spu_extract(entry, 0) + cTimeIncr, entry, 0);
				entry = spu_insert(spu_extract(entry, 1) + cCallIncr, entry, 1);
				*pEntry = entry;
				//conditional store, retried as long as the status is not 0
				mfc_putllc_again();
				status = mfc_read_atomic_status();
			}
			while(status != 0);
		}
#endif

		//returns true if we have to transfer a zero back for an external job state tracking
		//use case without branches expects a callback and an independent job
		//works as follows: (lock is located in first byte of cache line)
		//	- get sync cache line from PPU mem
		//	- if the packet counter is not already 0, loop until the lock is obtained, otherwise toggle callback
		//	- write the decremented count back atomically, loop until successful
		//	- unlock
		//returns true if job needs to be disabled by transferring a 0 to the info block,
		//	false if the callback has been issued through the interrupt mailbox (the callback does it then)
		//rJobFinished is set to 1 if no other SPUs are still processing the job (it is left untouched otherwise)
		//if a job has been spawned, neither callback nor external job state are triggered here
		SPU_DRIVER_INLINE
		const bool HandleCallback(uint32& rJobFinished)
		{
			//now toggle callback function if requested, callback is 16 bit index
			//	do it early since it takes long time and the job state has to be tested anyway (which is DMA'd with barrier)
			//neither callback nor external job state: nothing to notify
			if(g_sInfoBlock.callbackIndex == SInfoBlock::scNoIndex && g_sInfoBlock.GetExtJobStateAddress() == 0)
			{
				rJobFinished = 1;
				return true;
			}
			IF(g_sInfoBlock.spuPacketSyncIndex != SInfoBlock::scNoIndex, false)
			{
				//decrease counter on PPU atomically if it is not 0, otherwise we have to trigger the callback
				//this way it is ensured that only one SPU toggles it
				//NOTE(review): event is not used in this function
				uint32 event;
				int status;
				volatile uint8 *pSyncBuffer = (volatile uint8*)g_sLSBuffer;
				mfc_prep(pSyncBuffer, g_sSPUPacketSyncEA);
				//first loop: obtain the byte lock (byte 0) or finish the job if the counter is already 0
				do
				{
					status = 1;//keeps the loop running if the lock byte is currently set
					mfc_getllar_again();
					mfc_read_atomic_status();
					//counter of this job is 0: this SPU is the one which finishes the job
					IF(pSyncBuffer[g_sInfoBlock.spuPacketSyncIndex] == 0, false)
					{
						rJobFinished = 1;
						if(g_sInfoBlock.GetExtJobStateAddress() != 0)
						{
							//external job state is signaled by transferring a 0 to it
							MemcpyMainFenced(g_sInfoBlock.GetExtJobStateAddress(), (volatile TAddrLS)&g_sZero, 4, g_scDMAOutputTag);
							return true;
						}
						spu_writech(SPU_WrOutIntrMbox, (uint32)g_sInfoBlock.callbackIndex | (EVENT_CALLBACK_PORT << EVENT_PORT_SHIFT));
						return false;//callback does it
					}
					IF(pSyncBuffer[0] == 0, true)//still unlocked
					{
						//try to set the lock byte atomically
						pSyncBuffer[0] = 1;
						mfc_putllc_again();
						status = mfc_read_atomic_status();
					}
				}
				WHILE(status != 0, false);

				//second loop: lock is held by this SPU now
				//decrement and write back atomically
				do
				{
					mfc_getllar_again();
					mfc_read_atomic_status();
					pSyncBuffer[0] = 0;//unlock
					--pSyncBuffer[g_sInfoBlock.spuPacketSyncIndex];//decrement
					mfc_putllc_again();
					status = mfc_read_atomic_status();
				}
				WHILE(status != 0, false);
				return true;
			}
			//no packet sync: this SPU is the only one processing the job
			rJobFinished = 1;
			IF(!IsJobSpawned(), true)//no multiple packets
			{
				if(g_sInfoBlock.GetExtJobStateAddress() != 0)
				{
					MemcpyMainFenced(g_sInfoBlock.GetExtJobStateAddress(), (TAddrLS)&g_sZero, 4, g_scDMAOutputTag);
					return true;
				}
				spu_writech(SPU_WrOutIntrMbox, (uint32)g_sInfoBlock.callbackIndex | (EVENT_CALLBACK_PORT << EVENT_PORT_SHIFT));
				return false;//callback does it
			}
			//a job has been spawned: no callback and no external job state update from here
			return true;
		}

#if defined(ENABLE_INTER_SPU_EVENTS)
		//sends cSignal to the signal notification 2 register of the SPU with the ID cSPUId
		//the transfer is issued with DMA tag 2 and not waited for here
		SPU_DRIVER_INLINE
		void SendSignal2(const uint32 cSignal, const uint32 cSPUId)
		{
			//the signal is staged in the last word of the signal buffer
			uint32 *const pSignalSrc = &g_SignalBuffer[3];
			*pSignalSrc = cSignal;
			//address of the problem state register of the destination SPU
			const uint32 cDestSNR2EA = 
				NPPU::scPCRawSPUOffset * cSPUId + NPPU::scPCRawSPUBaseAddr + NPPU::scPCRawSPUProbOffset + NPPU::scPCSigNotify2;
			spu_mfcdma32(pSignalSrc, cDestSNR2EA, sizeof(uint32), 2, MFC_PUT_CMD);
		}
#endif

		//patches the branch of the job and its hint to the entry function located in the first page
		//cpJob:				job header in LS, branchOff/bhOff/destPageOff are offsets in units of 4 bytes
		//cPageMemAddr:	LS address of the page the entry function is located in
		SPU_DRIVER_INLINE
		void PatchJobEntry(const NPageBin::SJob* const cpJob, const uint32 cPageMemAddr)
		{
			uint8 *const pJobBytes = (uint8*)cpJob;
			//absolute LS destination, encoded as word address shifted into place (bit 7 upwards)
			const uint32 cEntryLS			= cPageMemAddr + ((uint32)cpJob->destPageOff << 2);
			const uint32 cBranchBits	= ((cEntryLS >> 2) << 7);
			//branch instruction first
			uint32 *const pBranch = (uint32*)(pJobBytes + (cpJob->branchOff << 2));
			*pBranch |= cBranchBits;
			//then the hint; if no branch hint exists, the offset points to the branch instruction itself (same value applied)
			uint32 *const pHint = (uint32*)(pJobBytes + (cpJob->bhOff << 2));
			*pHint |= cBranchBits;
		}

		//prefetch first pages and sync the first one itself
		//cpJob:		job header, initialPages[1..scMaxSPUPageCount-1] hold the page indices to stream in (-1: slot stays empty)
		//cDecrVal:	decrementer value used to compute the expected end of each transfer (transDecrEnd)
		//slot 0 already holds the first page (its transfer is issued by SetupPages), therefore each slot i is
		//	streamed to g_SPUPageMem + i * g_SPUPageSize, which is the slot layout SetupPages stores in g_PageMemLower
		//	(streaming to a running address starting at g_SPUPageMem would overwrite the first page and
		//	misplace slots following an empty one)
		SPU_DRIVER_INLINE
		void PrefetchPages(const NPageBin::SJob* const cpJob, const uint32 cDecrVal)
		{
			//now stream in other pages
			const uint32 cPageMemBase = (uint32)g_SPUPageMem;
			for(unsigned int i=1; i<scMaxSPUPageCount; ++i)
			{
				const int16 cPageIndex = cpJob->initialPages[i];
				if(cPageIndex == -1)//test if set at all
					continue;
				const uint32 cPageEA		= g_GlobalSPUPageDir[cPageIndex].ea;
				const uint32 cPageSize	= g_GlobalSPUPageDir[cPageIndex].size;
				//destination is determined by the slot index, not by the number of pages streamed so far
				const uint32 cSlotMem		= cPageMemBase + i * g_SPUPageSize;
				//each slot uses its own DMA tag
				MemcpyLargeLS((TAddrLS)cSlotMem, cPageEA, cPageSize, g_scDMAPageTag0+i, false);

#if defined(DO_SPU_PROFILING)
				g_PerfStats.pageMemTransferred += cPageSize;
				++g_PerfStats.pagesTransferred;
#endif

				g_SPUPageStates[i].curIndex = cPageIndex;
				g_SPUPageStates[i].curState = PAGE_STATE_STREAMING;
				g_SPUPageStates[i].transDecrEnd	= cDecrVal - (cPageSize >> BYTES_PER_DECR_TICK_SHIFT);
				SetPageIndex(cPageIndex, i);
			}

			//sync first page
			if(g_SPUPageStates[0].curState != PAGE_STATE_READY)
			{
				SyncMemory(g_scDMAPageTag0);
				g_SPUPageStates[0].curState = PAGE_STATE_READY;
			}			
		}

		//sets g_SPUPageSize to the max page size of the current job and returns the size of the entire page memory:
		//	4 slots for ePM_Quad, 2 slots for ePM_Dual, 1 slot otherwise
		//in dual mode the last 2 slots are additionally marked so that only the first 2 can be replaced
		//cFirstPageIndex is not used
		SPU_DRIVER_INLINE
		uint32 GetPageMemSize(const NPPU::EPageMode cPageMode, const uint16 cFirstPageIndex)
		{
			const uint32 cSlotSize = g_sInfoBlock.GetMaxPageSize();
			g_SPUPageSize = cSlotSize;
			IF(cPageMode == NPPU::ePM_Dual, 1)
			{
				//make sure only first 2 slots can be replaced (LRU and dir entry set for last 2 slots)
				UpdatePageLRUByIndex(2, 0xFFFFFFFF);
				UpdatePageLRUByIndex(3, 0xFFFFFFFF);
				g_SPUPageDir = spu_insert(0x7FFFFFFF, g_SPUPageDir, 2);
				g_SPUPageDir = spu_insert(0x7FFFFFFF, g_SPUPageDir, 3);
				return cSlotSize << 1;
			}
			if(cPageMode == NPPU::ePM_Quad)
				return cSlotSize << 2;
			return cSlotSize;
		}

		//sets up the code page memory for the current job and issues the transfer of its first page
		//rpCurAddr:				current LS allocation address, incremented by the size of the page memory
		//cPageMode:				page mode of the job (determines the number of page slots, see GetPageMemSize)
		//cFirstPageIndex:	index into g_GlobalSPUPageDir of the first page, it is streamed into slot 0
		//the first page transfer is only issued here (tag g_scDMAPageTag0), it is not synced in this function
		SPU_DRIVER_INLINE
		void SetupPages
		(
			TAddrLS& rpCurAddr, 
			const NPPU::EPageMode cPageMode,
			const uint16 cFirstPageIndex
		)
		{
			//NOTE(review): cLastPageMode is not used in this function
			const NPPU::EPageMode cLastPageMode = GetLastPageMode();
			//do not reuse code pages as this introduces potential issues
			g_SPUPageDir = spu_splats(-1);
			//get first page, transfer takes the longest
			uint32 firstPageSize = 0;
			g_CurPageIndex = (vec_uint4){(uint32)cFirstPageIndex};

#if !defined(_NO_SPU_ASSERT)
			//with asserts the page memory size is required before the transfer since the memory gets zeroed below
			//set up pages, issue earlier to zero memory
			ResetPageLRUCounter();
			ResetPageLRU();
			uint32 pageMemSize = GetPageMemSize(cPageMode, cFirstPageIndex);
#endif

			const uint32 cPageEA	= g_GlobalSPUPageDir[cFirstPageIndex].ea;
			firstPageSize					= g_GlobalSPUPageDir[cFirstPageIndex].size;
#if !defined(_NO_SPU_ASSERT)
			//zero page memory
			const uint32 cIters = (pageMemSize) / 16;
			const vec_uint4 cZero = spu_splats((uint32)0);
			vec_uint4*const __restrict pPageMem = (vec_uint4*)g_SPUPageMem;
			for(uint32 s=0; s<cIters; ++s)
				pPageMem[s] = cZero;
			//re-init return stack (starts at entry 1, entry 0 is left untouched)
			vec_uint4 *const __restrict pSReturnStackEntryVec = (vec_uint4*)&g_ReturnStack[0];
			for(uint32 u=1; u<RETURN_STACK_MAX_ENTRIES; ++u)//entry is 16 bytes
				pSReturnStackEntryVec[u] = cZero;
#endif
			//issue the transfer of the first page into slot 0
			MemcpyLargeLS(g_SPUPageMem, cPageEA, firstPageSize, g_scDMAPageTag0, false);
			g_SPUPageStates[0].curIndex	= cFirstPageIndex;
			g_SPUPageStates[0].curState	= PAGE_STATE_STREAMING;//will be ready before job starts (forced)
			SetPageIndex(cFirstPageIndex, 0);
#if defined(DO_SPU_PROFILING)
			g_PerfStats.pageMemTransferred  += firstPageSize;
			++g_PerfStats.pagesTransferred;
#endif
#if defined(DO_SPU_PROFILING)
			g_PerfStats.firstPageSize  = g_GlobalSPUPageDir[cFirstPageIndex].size >> 4;
#endif
			//the remaining slots start empty
			NSPU::g_SPUPageStates[1].Reset();
			NSPU::g_SPUPageStates[2].Reset();
			NSPU::g_SPUPageStates[3].Reset();

#if defined(_NO_SPU_ASSERT)
			//without asserts nothing gets zeroed, so the setup is done after the transfer has been issued
			//set up pages
			ResetPageLRUCounter();
			ResetPageLRU();
			uint32 pageMemSize = GetPageMemSize(cPageMode, cFirstPageIndex);
#endif

			SetLastPageMode(cPageMode);
			const uint32 cSPUPageSize = g_SPUPageSize;

			//set the lower and upper address of the page memory, used to compare against it with spu_cgt in CodePagingCallMissHandler(asm)
			//word i of the lower bound is g_SPUPageMem + i * page size, the upper bound is one page size above
			vec_uint4 pageMemLower = spu_promote((uint32)g_SPUPageMem, 0);
			uint32 incrPageBaseAddr	= cSPUPageSize + (uint32)g_SPUPageMem;
			pageMemLower				= spu_insert(incrPageBaseAddr, pageMemLower, 1);
			incrPageBaseAddr	 += cSPUPageSize;
			pageMemLower				= spu_insert(incrPageBaseAddr, pageMemLower, 2);
			incrPageBaseAddr	 += cSPUPageSize;
			pageMemLower				= spu_insert(incrPageBaseAddr, pageMemLower, 3);
			g_PageMemUpper			= spu_add(pageMemLower, cSPUPageSize);
			g_PageMemLower			= pageMemLower;

			UpdatePageLRUByIndex(0, 1);//make sure first page does not get replaced
			//reserve the page memory in the LS allocation
			IncrementPointer(rpCurAddr, pageMemSize);
			g_pReturnStackTop			= &g_ReturnStack[0];
		}

		//resets the cache control data: directory and LRU control of all sets, the LRU counters
		//and the async / prefetch directories
		SPU_DRIVER_INLINE
		void ResetCacheControl()
		{
			const vec_uint4 cAllZero = spu_splats((uint32)0);
			const uint32 cSetCount = g_SPUNumSets;
			//reset cache dir entries, 4 at once to give the branch hint a chance to be set
			for(uint32 firstSet=0; firstSet<cSetCount; firstSet += 4)
			{
				g_pSPUCacheLRUCtrl[firstSet]		= cAllZero;
				g_pSPUCacheLRUCtrl[firstSet+1]	= cAllZero;
				g_pSPUCacheLRUCtrl[firstSet+2]	= cAllZero;
				g_pSPUCacheLRUCtrl[firstSet+3]	= cAllZero;
				g_pSPUCacheDir[firstSet]				= cAllZero;
				g_pSPUCacheDir[firstSet+1]			= cAllZero;
				g_pSPUCacheDir[firstSet+2]			= cAllZero;
				g_pSPUCacheDir[firstSet+3]			= cAllZero;
			}
			//reset LRU counters
			g_LRUCounter			= cAllZero;
			g_LRUCounterIncr	= cAllZero;
			//async ranges: from is set to all ones, to is set to 0
			NCache::g_AsyncRangesDirFrom	= spu_splats((uint32)0xFFFFFFFF);
			NCache::g_AsyncRangesDirTo		= cAllZero;
			NCache::g_CurSPUAsyncTag			= 0;
			NCache::g_SPUAsyncDir					= cAllZero;
			NCache::g_PrefetchDir					= cAllZero;
			NCache::g_PrefetchLRUDir			= cAllZero;
		}
		
		//one time initialization of cache related globals:
		//	profiling counters (if enabled), atomic EAs, page slot index masks, g_sZero and the return stack
		SPU_DRIVER_INLINE
		void InitCache()
		{
			const vec_uint4 cAllZero = spu_splats((uint32)0);
#if !defined(SPU_CACHE_MISS_USE_ASM)
			g_CurWrittenEA	  = (vec_uint4){0}; 
#endif

#if defined(DO_SPU_PROFILING)
			//hit increment: 1 in word 0, 0 elsewhere
			NCache::g_SPUCacheHitIncr = spu_insert((uint32)1, cAllZero, 0);
			//init profiling data
			for(unsigned int counterVec=0; counterVec<MAX_PROF_ID * (4*3) / sizeof(vec_uint4); ++counterVec)
				NCache::g_SPUCacheProfIDCounter[counterVec] = cAllZero;
			NCache::g_SPUCacheCurProfID = cAllZero;
#endif

			NCache::g_CurAtomicEA				= 0;
#if defined(ENABLE_HAZARD_MODE)
			NCache::g_AtomicEAToStart		= 0; 
#endif

			//slot index per word and the same index multiplied by 16
			g_SPUPageIndexMask0			= (vec_uint4){0, 1, 2, 3};
			g_SPUPageIndexMaskShl4	= (vec_uint4){0, 16, 32, 48};
			g_sZero = cAllZero;

			//initialize return stack, each entry is treated as one 16 byte vector
			vec_uint4 *const __restrict pStackVecs = (vec_uint4*)&g_ReturnStack[0];
			for(uint32 entry=0; entry<RETURN_STACK_MAX_ENTRIES; ++entry)
				pStackVecs[entry] = cAllZero;
			//push a first index (which is the job call to the first page)
			//top index is the offset to the current top index (0, 16, 32..) (16 = sizeof(SReturnStackEntry))
			const SReturnStackEntry cJobPageEntry = {0, SReturnStackEntry::cIsJobPage, 0, 0, 0, 0, 0, 0};
			g_ReturnStack[0] = cJobPageEntry;
		}

		//sets up the software cache for the current job directly behind cTopLSALigned
		//cTopLSALigned: LS address the cache memory starts at (current top of the allocated LS)
		//the LS is laid out as: cache | shadow cache | cache directory | LRU control
		//g_sProgramTopLS is set to the address behind the cache data (or to cTopLSALigned if the job requests no cache)
		SPU_DRIVER_INLINE
		void SetupCache(const uint32 cTopLSALigned)
		{
			//set up cache
			//determine cache size: 0, 8, 16, 32 or 64 KB
			//NOTE(review): the selection below falls back to 4 KB if not even the 8 KB configuration fits
			//each cache requires: size * 2(cache + shadow cache) + size / 8 (LRU and DIR) bytes
			const uint32 cMaxRequCacheSize = g_sInfoBlock.GetMaxCacheSize();
			if(cMaxRequCacheSize == 0)
			{
#if defined(DO_SPU_PROFILING)
				g_PerfStats.cacheSize = 0;
#endif
				g_sProgramTopLS = cTopLSALigned;
				g_SPUNumSets		= 0;
//				g_CurAtomicEA		= 0;
				return;//bypass cache
			}
			//remaining LS: 255 KB minus the minimum stack size of the job minus everything allocated so far
			//NOTE(review): unsigned arithmetic, wraps around if stack and top exceed 255 KB
			const uint32 cRemSize = (uint32)(255 * 1024) - (uint32)g_sInfoBlock.GetMinStackSize() - (uint32)cTopLSALigned;
			//pick the largest configuration which fits into the remaining LS
			uint32 cacheSize = 
				cRemSize >= TOTAL_CACHE_SIZE(64*1024)?64*1024 :
				cRemSize >= TOTAL_CACHE_SIZE(32*1024)?32*1024 :
				cRemSize >= TOTAL_CACHE_SIZE(16*1024)?16*1024 : 
				cRemSize >= TOTAL_CACHE_SIZE(8*1024)?8*1024 : 4*1024;
			//never use more than the job has requested
			cacheSize = (cacheSize>cMaxRequCacheSize)?cMaxRequCacheSize : cacheSize;
#if defined(DO_SPU_PROFILING)
			g_PerfStats.cacheSize = (cacheSize >> 10);//in KB
#endif
			g_pSPUCache = (vec_uint4*)(void*)cTopLSALigned;//entire cache memory
			//cache copy for each line, contains original state for each bit in a cacheline to only write back 
			//	the bits which have changed (if any)
			//	costs considerable amount of memory (same as cache itself), but only way to ensure to only write back
			//		the changed bits and enables us to not mark anything as dirty
			NSPU::NCache::g_pSPUShadowCache	= (vec_uint4*)((uint8*)g_pSPUCache + cacheSize);
			//cache directory containing line addresses (1x uint32 per cache line), 0 indicates no mapping
			g_pSPUCacheDir									= (vec_uint4*)((uint8*)NSPU::NCache::g_pSPUShadowCache + cacheSize);
			//cache LRU Control, 1x uint32 counter per cache line in each set
			g_pSPUCacheLRUCtrl							= (vec_uint4*)((uint8*)g_pSPUCacheDir + (cacheSize >> 5));
			//number of sets
			g_SPUNumSets										= (((cacheSize >> scSPUCacheLineSizeShift)) >> scSPUCacheSetNumWaysShift);
			
			g_sProgramTopLS = cTopLSALigned + TOTAL_CACHE_SIZE(cacheSize);
#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
			assert(NSPU::g_sProgramTopLS <= (256-16) * 1024);
#endif 
			//initialize global cache constants
			//word 0..3 become 0, 128, 256, 384 plus the LS address of the cache
			g_SPUCacheLineOffValues = spu_splats((uint32)0);//create 384, 256, 128, 0
			g_SPUCacheLineOffValues = spu_insert(128, g_SPUCacheLineOffValues, 1); 
			g_SPUCacheLineOffValues = spu_insert(256, g_SPUCacheLineOffValues, 2);
			g_SPUCacheLineOffValues = spu_insert(384, g_SPUCacheLineOffValues, 3);
			g_SPUCacheLineOffValues = spu_add(g_SPUCacheLineOffValues, spu_splats((uint32)g_pSPUCache));

			//word 0 of g_SetMaskSL4: (number of sets - 1) << 4, words 2 and 3 are owned by SetupFunctionTable
			g_SetMaskSL4	= spu_insert((uint32)((g_SPUNumSets-1) << 4), g_SetMaskSL4, 0);//overwrite only the lower 4 bytes
			
			ResetCacheControl();
		}

		//publishes the LS addresses of the job's lookup tables in the upper two slots of g_SetMaskSL4
		//	slot 3: function table (index -> page ID,off)
		//	slot 2: debug PPU address table, located at half the function table size behind the function table
		//cFuncTableLSAddr: LS address of the function table
		//cFuncTableSize:   size in bytes covering both tables
		SPU_DRIVER_INLINE
		void SetupFunctionTable(const uint32 cFuncTableLSAddr, const uint32 cFuncTableSize)
		{
			const uint32 cDebugAddrTableLSAddr = cFuncTableLSAddr + (cFuncTableSize >> 1);
			//both inserts hit different slots, so they can be chained; all other slots keep their value
			g_SetMaskSL4 = spu_insert
			(
				cDebugAddrTableLSAddr,
				spu_insert(cFuncTableLSAddr, g_SetMaskSL4, 3),
				2
			);
		}

		//advances a queue pull address by one entry and wraps it around at the end of the queue
		//rCurPullAddr: in/out, current pull address, receives the advanced (or wrapped) address
		//cIncr:        size of one queue entry in bytes
		//cQueueStart:  address the pull pointer wraps to
		//cQueueEnd:    first address past the queue, reaching or exceeding it triggers the wrap
		SPU_DRIVER_INLINE
		void IncrQueuePullPointer(uint32& rCurPullAddr, const uint32 cIncr, const uint32 cQueueStart, const uint32 cQueueEnd)
		{
			uint32 advancedPull = rCurPullAddr + cIncr;
			if(advancedPull >= cQueueEnd)
				advancedPull = cQueueStart;
			rCurPullAddr = advancedPull;
		}

		//records where the current job lives (main memory and LS) together with the page memory layout
		//	in g_scJobStorageLS, this information is used to retrieve the current job and to map a PC to the elf
		//pJobEntry: LS address the job binary has been placed at
		SPU_DRIVER_INLINE
		void StoreJobInfo(TAddrLS pJobEntry)
		{
			//gather all values first
			const uint32 cJobBinaryEA		= (uint32)g_sInfoBlock.eaDMAJobAddress;
			const uint32 cJobStartLS		= (uint32)pJobEntry;
			const uint32 cPageMemStartLS	= (uint32)g_SPUPageMem;
			const uint32 cPageSize			= (uint32)g_SPUPageSize;
			//then publish them
			g_scJobStorageLS.binJobAddrEA		= cJobBinaryEA;
			g_scJobStorageLS.lsStartJob			= cJobStartLS;
			g_scJobStorageLS.lsStartPageMem	= cPageMemStartLS;
			g_scJobStorageLS.pageSize				= cPageSize;
		}

		//zeroes the bss section of a job in local store
		//	works in 4 stages: single words up to the next 16 byte boundary, 128 byte blocks, 
		//	single quadwords and finally the trailing words
		//pBSS:    LS start address of the bss section (assumed to be a multiple of 4)
		//bssSize: size of the bss section in bytes (assumed to be a multiple of 4)
		SPU_DRIVER_INLINE
		void FillBSS(TAddrLS pBSS, uint32 bssSize)
		{
			//bytes in front of the next 16 byte boundary, 0 if pBSS is already aligned
			//	binary '-' binds tighter than '&', the parentheses just make that explicit
			const uint32 cBytesToBoundary = (16 - (uint32)pBSS) & 15;
			//never clear more than the section contains
			const uint32 cHeadBytes = (cBytesToBoundary > bssSize)?bssSize : cBytesToBoundary;
			uint32 *pHead = (uint32*)pBSS;
			for(uint32 words = cHeadBytes/4; words != 0; --words)
				*pHead++ = 0;

			//everything behind the head starts on a 16 byte boundary and is cleared quadword wise
			const uint32 cBodyBytes = bssSize - cHeadBytes;
			const vec_uint4 cZeroVec = spu_splats((uint32)0);
			vec_uint4 * __restrict pDst16 = (vec_uint4*)(void*)(((uint32)pBSS+15)&~15);
			//blocks of 128 bytes, 8 quadword stores per iteration
			for(uint32 blocks = cBodyBytes/128; blocks != 0; --blocks)
			{
				pDst16[0] = cZeroVec;
				pDst16[1] = cZeroVec;
				pDst16[2] = cZeroVec;
				pDst16[3] = cZeroVec;
				pDst16[4] = cZeroVec;
				pDst16[5] = cZeroVec;
				pDst16[6] = cZeroVec;
				pDst16[7] = cZeroVec;
				pDst16 += 8;
			}
			//whole quadwords not covered by the 128 byte blocks
			for(uint32 quads = (cBodyBytes & 127)/16; quads != 0; --quads)
				*pDst16++ = cZeroVec;
			//words not covered by a whole quadword
			uint32 *pTail = (uint32*)pDst16;
			for(uint32 words = (cBodyBytes & 15)/4; words != 0; --words)
				*pTail++ = 0;
		}

		//processes a fetched job until no further packets are found
		//packets have all the same size in terms of parameter data and list size
		//the job to process is described by the global g_sInfoBlock
		//rough sequence:
		//	- lay out job binary, bss and page memory behind cAvailMemStart and kick off the job DMA (if the job is not resident)
		//	- fetch bucket headers, page info and (for producer/consumer queue jobs) the queue state and the first parameter packet
		//	- set up the cache, sync and patch the job, execute it
		//	- flush the cache (or just sync atomics if the cache is kept)
		//	- queue jobs: atomically update the queue state on PPU side and restart with the next packet if one is available
		//	- non queue jobs: handle job spawning and callbacks
		//	- write back bucket headers and release the job slot
		//cAvailMemStart: LS address of the first free byte behind driver and page directory (128 byte aligned by main)
		//cSPUID:         ID of this SPU, becomes the thread id of the job if the info block does not specify one
		//cMaxSPUs:       number of SPUs, not referenced inside this function body
		SPU_DRIVER_INLINE
		void ProcessJob(const uint32 cAvailMemStart, const uint32 cSPUID, const uint32 cMaxSPUs)
		{
#if defined(SUPP_SN)
			//halt the SPU right here if the job requests driver debugging
			if(g_sInfoBlock.IsDriverDebugEnabled())
				__asm volatile ("stop 255");
			g_sDebugState = (uint32)g_sInfoBlock.IsDebugEnabled();
#endif
			//get the info packet, cDriverSize is a multiple of 16 (ensured by PPU)
			TAddrLS pCurAddr = (TAddrLS)cAvailMemStart;//128 byte aligned
#if defined(DO_SPU_PROFILING) || defined(SUPP_SPU_FRAME_STATS)
			//decrementer is reset right before calling ProcessJob
	#if defined(DO_SPU_PROFILING)
			g_PerfStats.Reset();
			const uint32 cJobStartTime = spu_readch(SPU_RdDec);
			g_PerfStats.driverSize = cAvailMemStart >> 4;
	#endif
#endif
			//setup of the cache and page memory, must come first since g_SPUPageMem remains constant
			const NPPU::EPageMode cPageMode = g_sInfoBlock.GetPageMode();
			const uint16 cFirstPageIndex = g_sInfoBlock.GetFirstPageIndex();
			//the sizes and offsets below are shifted left by 2 to obtain bytes
			const uint32 cJobSize = g_sInfoBlock.jobSize << 2;
			//job must first get transferred to available memory since data+bss from pages are remapped statically there
			TAddrLS pJobEntry = pCurAddr;
			const uint32 cFuncTableBinOff = g_sInfoBlock.funcTableBinOff << 2;
			const uint32 cFuncTableSize		= g_sInfoBlock.funcTableSize << 2;
			uint32 cBSSSize = g_sInfoBlock.bssSize << 2;
			const uint32 cBSSOff	= g_sInfoBlock.bssOff << 2;
			TAddrLS pBSS = (TAddrLS)((uint8*)pCurAddr + cBSSOff);
#ifdef DO_SPU_FUNCPROFILING
			//the function profiling timing area is placed 128 byte aligned behind the bss, 
			//	if timing entries exist the bss size is enlarged so the area gets reserved and zeroed together with the bss
			uint32 alignedBssAddr = AlignSize128((uint32)pBSS + cBSSSize);
			g_FuncProfData.funcProfSPUTimingAreaBss = alignedBssAddr;
			g_FuncProfData.funcProfileCount = g_sInfoBlock.funcProfTimingCount;
			IF(g_sInfoBlock.funcProfTimingCount, 1)
				cBSSSize = AlignSize128( (alignedBssAddr + AlignSize128(g_sInfoBlock.funcProfTimingCount*4*2)) - (uint32)pBSS );
#endif
			//page memory starts 128 byte aligned behind job binary and bss
			IncrementPointer(pCurAddr, AlignSize128(cJobSize + cBSSSize));
			g_SPUPageMem = (uint8*)pCurAddr;
			const uint32 cJobProgramEA = g_sInfoBlock.eaDMAJobAddress;
			//with asserts enabled the job is only reloaded if it differs from the last processed one, 
			//	without asserts it is always reloaded
#if !defined(_NO_SPU_ASSERT)
			const int cReloadJob = cJobProgramEA != GetLastProgramEA();
#else
			const int cReloadJob = 1;
#endif
			SetupPages(pCurAddr, cPageMode, cFirstPageIndex);
			bool syncOnJob = false;
			//copy job only if not still present, useful since it is by far the most time consuming setup job
			if(cReloadJob)
			{
				//copy job (text, data, rodata), the transfer on g_scDMAJobTag is waited for behind StartParameterLoop
				MemcpyLargeLS(pJobEntry, cJobProgramEA, cFuncTableBinOff, g_scDMAJobTag, false);
				//copy function table, in LS it is located behind the bss (in the binary it directly follows the job)
				const uint32 cFuncTableLSAddr = (uint32)pJobEntry + cFuncTableBinOff + cBSSSize;
				MemcpyLS(cFuncTableLSAddr, cJobProgramEA+cFuncTableBinOff, AlignSize16(cFuncTableSize), g_scDMAJobTag);
				//set up function pointer table
				SetupFunctionTable(cFuncTableLSAddr, cFuncTableSize);
				SetLastProgramEA(cJobProgramEA);
				syncOnJob = true;
			}
			
			//transfer bucket headers here, for __SPU__ we have a padding of 16 bytes
			MemcpyLS
			(
				(void*)NSPU::NDriver::g_sMemMan.GetBucketSPUAddr(),
				NSPU::NDriver::g_DestMemAreaEA,
				128, 
				g_scDMAPPUMemTag
			);
			//reload page info
			MemcpyLS(&NSPU::g_PageInfo, NSPU::NDriver::g_sPageEA, sizeof(NSPU::SPageDirInfo), g_scDMAPPUMemTag);
			ResetJobSpawnReg();
			uint32 queueAddress = 0;
			const bool cHasQueue = g_sInfoBlock.HasQueue();
			uint8* pQueueBuffer = NULL;
			if(cHasQueue)
			{
				//producer/consumer queue job: reserve a 128 byte LS buffer mirroring the queue state in main memory
				queueAddress	= g_sInfoBlock.GetQueue();
//				assert((queueAddress & 127) == 0);//must be cache line aligned
				pCurAddr			= (TAddrLS)AlignSize128((uint32)pCurAddr);
				//transfer push/pull pointer here
				pQueueBuffer	= (uint8*)pCurAddr;
				//transfer pull/push ptr of queue here
				MemcpyLS(pQueueBuffer, queueAddress, 128, g_scDMAListTag);
				IncrementPointer(pCurAddr, 128);
			}
#if !defined(DO_SPU_PROFILING) && !defined(SUPP_SPU_FRAME_STATS)
			//do it now since this way we do not lose any performance
			spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
#endif

#if defined(DO_SPU_PROFILING)
			g_PerfStats.jobSize = cJobSize >> 4;
			g_PerfStats.spuFetchTime = g_JobFetchTime;
#endif
#if defined(ENABLE_INTER_SPU_EVENTS)
			//now inform next SPU to change state from wait to polling if active(if not ID 4)
			const bool cStateNextSPUWaiting = ((spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 1) == STATE_NEXT_SPU_DEACTIVATED));
			IF(cStateNextSPUWaiting, true)
			{
				//send STATE_EVENT_POLLING signal to next SPU since we need 1 responsive SPU
				SendSignal2(STATE_EVENT_POLLING, spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 3));
				NSPU::NDriver::g_sEventStatNextSPU = spu_insert(STATE_NEXT_SPU_ACTIVATED, NSPU::NDriver::g_sEventStatNextSPU, 1);
			}
#endif
			//set thread id if necessary
			g_sInfoBlock.curThreadId = (g_sInfoBlock.curThreadId == SInfoBlock::scNoThreadId)?cSPUID : g_sInfoBlock.curThreadId;

			//note: the queue related variables below are only initialized and only used if cHasQueue is set
			uint8* __restrict pParamArea;
			int syncOnFlushCache;
			volatile uint32* pQueuePull;
			volatile uint32* pQueuePush;
			volatile uint32* pQueueJobState;
			uint32 curPullAddr, curPushAddr;
			uint32 queueIncr, queueStart, queueEnd;
			uint32 curTopLS = (uint32)pCurAddr;
			uint32 keepCache = false;
			//parameter size is stored in units of 16 bytes
			const uint32 cParamSize = (g_sInfoBlock.paramSize << 4);
			if(cHasQueue)
			{
				//sync transfer of queue here
				syncOnFlushCache = 1;//flush cache must be syncd
				pQueuePull			= (uint32*)&pQueueBuffer[NPPU::scProdConsPullOff];
				pQueuePush			= (uint32*)&pQueueBuffer[NPPU::scProdConsPushOff];
				pQueueJobState	= (uint32*)&pQueueBuffer[0];
				SyncMemory(g_scDMAListTag);			//sync transfer of the queue state issued above
				//set push, pull and job state pointer
				curPullAddr			= *pQueuePull;
				//transfer first parameter packet here, each packet consists of the parameters followed by a SAddPacketData
				MemcpyLS(pCurAddr, curPullAddr, cParamSize + sizeof(NPPU::SAddPacketData), g_scDMAPPUMemTag);
				//get constant queue increment, start and end address
				queueIncr				= *(uint32*)&pQueueBuffer[NPPU::scProdConsPullIncr];
				queueStart			= *(uint32*)&pQueueBuffer[NPPU::scProdConsPullStart];
				queueEnd				= *(uint32*)&pQueueBuffer[NPPU::scProdConsPullEnd];

				curPushAddr			= *pQueuePush;
				//local pull address now refers to the packet following the one being fetched
				IncrQueuePullPointer(curPullAddr, queueIncr, queueStart, queueEnd);
				pParamArea = (uint8*)pCurAddr;
				curTopLS += (cParamSize + sizeof(NPPU::SAddPacketData));
				keepCache = g_sInfoBlock.KeepCache();
			}
			else
			{
				pParamArea				= g_sInfoBlock.GetParamAddress();//embedded
				//the cache flush is only synced if no external job state address is set
				syncOnFlushCache	= (g_sInfoBlock.GetExtJobStateAddress() == 0)?1:0;
			}
			//definition of job function (takes pointer to transferred parameters as input param)
			//the entry point is located behind the 128 byte aligned job header (plus the module header in the active branch)
		#define JOB_HEADER_SIZE ((sizeof(NPageBin::SJob) + 127) & ~127)
//		#if defined(SUPP_SN)
		#if 1
			#ifdef DEBUG_HEADER_GUID_EXT_TEMP
				void (*pFnctExecute)(void*) = (void (*)(void*))((uint32)pJobEntry + JOB_HEADER_SIZE + 16 + 128);
			#else
				void (*pFnctExecute)(void*) = (void (*)(void*))((uint32)pJobEntry + JOB_HEADER_SIZE + sizeof(spu_mod_hdr));
			#endif
		#else
			void (*pFnctExecute)(void*) = (void (*)(void*))((uint32)pJobEntry + JOB_HEADER_SIZE);
		#endif

#if defined(DO_SPU_PROFILING)
			//accumulated decrementer ticks spent inside the job function over all packets
			uint32 jobTicks = 0;
#endif

			NSPU::NDriver::g_sMemMan.Reset();//reset memory management

			bool disableJobAtEnd = true;
			bool firstLoopIt = true;
			uint32 addJobInvoc = 0;//count additional job invocations through prod./cons.queue

			//the cache gets placed at the 128 byte aligned top of the memory used so far
			curTopLS = AlignSize128(curTopLS);

#if defined(SUPP_DABR)
			g_sDABR.lsAddr	= (uint32)(void*)&NCache::SyncAtomicDCache;//reset dabr to some constant data(code)
			g_sDABR.oldVal	= *((uint32*)(void*)g_sDABR.lsAddr);
			g_sDABR.addData = 0;
			g_sDABR.ppuEA		= 0;
#endif

			StoreJobInfo(pJobEntry);

			FillBSS(pBSS, cBSSSize);

			//sync the transfers issued on g_scDMAPPUMemTag above (bucket headers, page info and first queue packet)
			SyncMemory(g_scDMAPPUMemTag);

//------------------------------------------prod/cons queue packet loop begin------------------------------------------------
//re-entered via goto for each further queue packet if the cache is not kept
StartParameterLoop:
			SetupCache(curTopLS);	//allow diff.cache settings for each job

			//only sync code if it was not present before
			if(syncOnJob)
			{
				syncOnJob = false;
//#if defined(_DEBUG)
#if 0
				SyncMemory(g_scDMAJobTag);
#else
				//wait for the job binary and function table transfers
				const uint32 cTagMask = (1<<g_scDMAJobTag);
				spu_writech(MFC_WrTagMask, cTagMask);
				//use polling in case of profiling since the large job seemed to hang up the read to the decrementer
#if !defined(DO_SPU_PROFILING)
				spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
				spu_readch(MFC_RdTagStat);
#else			
				do {} WHILE(spu_mfcstat(MFC_TAG_UPDATE_IMMEDIATE) != cTagMask, false);
#endif
#endif //_DEBUG
#if !defined(_NO_SPU_ASSERT)
				g_FuncTableEntryCount = (uint32)(((NPageBin::SJob*)pJobEntry)->funcTableEntryCount);
#endif
//			uint32 *pFuncTable = (uint32*)((uint32)pJob + (uint32)pJob->funcTableOffset);
//			for(unsigned i=0; i<pJob->funcTableEntryCount;++i)
//			printf("   funcTable[%d]=0x%08x\n",i,pFuncTable[i]);
				//the job has just been loaded, patch it against the start of the page memory
				PatchJobEntry((NPageBin::SJob*)pJobEntry, (uint32)g_SPUPageMem); 
			}

//re-entered via goto for each further queue packet if the cache is kept (skips SetupCache)
StartParameterLoopKeepCache:
			IF(!firstLoopIt, false)
			{
				//leave pages as they are but make sure the first one is present, patch again accordingly
				const bool cTransferring	= SetActivePages(spu_maskb((unsigned short)cFirstPageIndex), spu_maskb((unsigned short)65535), spu_maskb((unsigned short)65535), spu_maskb((unsigned short)65535));
				const vec_int4 cSplatID		= spu_splats((int)cFirstPageIndex);
				const int cFirstPageSlot	= GetPageSlot(cSplatID);
				PatchJobEntry((NPageBin::SJob*)pJobEntry, (uint32)(g_SPUPageMem + cFirstPageSlot * g_SPUPageSize));
				//sync first page if it was transferring
				IF(cTransferring, false)
				{
					g_SPUPageStates[cFirstPageSlot].curState = PAGE_STATE_READY;
					SyncMemory(g_scDMAPageTag0+cFirstPageSlot);
				}
			}
			else
				PrefetchPages((NPageBin::SJob*)pJobEntry, 0xFFFFFFFD);//first execution of this job

#if defined(DO_SPU_PROFILING)
			const uint32 cJobExecStartTime = spu_readch(SPU_RdDec);
#endif

#if defined(DO_SPU_PROFILING)
			g_PerfStats.stackSize = (((NSPU::GetStackAddress() - g_sProgramTopLS)) >> 10);//in KB
#endif

#if !defined(_NO_SPU_ASSERT)
			//remember the stack address to verify that the job leaves it untouched
			const unsigned int cStackBefore = NSPU::GetStackAddress();
#endif

#ifdef DO_SPU_FUNCPROFILING
		g_FuncProfData.jobTimingAreaOffset = ((NPageBin::SJob*)pJobEntry)->funcProfTimingOff128 * 128;
#endif
/*
	typedef union { char c4[4]; uint16_t u16[2]; uint32_t u32; } Module_u;
	const Module_u s_mu = { { 'P', 'h', 'y', 's' } };
	__spu_insert_bookmark( 0xffaa );	// start marker 1
	__spu_insert_bookmark( s_mu.u16[0] );	// name
	__spu_insert_bookmark( s_mu.u16[1] );	// name
	__spu_insert_bookmark( 1 );	// level
	__spu_insert_bookmark( ((uint32)pFnctExecute-16) >> 2);		// LSA
	__spu_insert_bookmark( 0xffab );	// start marker 2
__spu_insert_bookmark( 0x3E0 );
*/
		pFnctExecute((void*)pParamArea);//execute job 
/*
typedef union { uint16_t u16[4]; uint32_t u32[2]; uint64_t u64; } GUID_u;
GUID_u guid;
qword cPageGUID = (qword)si_lqd((qword)spu_promote((uint32)pFnctExecute-16,0), 0);
qword insn = si_roti(cPageGUID, 7);
qword pattern = (qword)(vec_uchar16){0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13};
guid.u64 = si_to_ullong(si_shufb(insn, insn, pattern));
__spu_insert_bookmark( 0xffac );	// start marker 1
__spu_insert_bookmark( guid.u16[0] );	// guid
__spu_insert_bookmark( guid.u16[1] );	// guid
__spu_insert_bookmark( guid.u16[2] );	// guid
__spu_insert_bookmark( guid.u16[3] );	// guid
__spu_insert_bookmark( 0xffad );	// start marker 2
__spu_insert_bookmark( 0x7C00 );
*/
#if !defined(_NO_SPU_ASSERT)
			const unsigned int cStackAfter = NSPU::GetStackAddress();
			assert(cStackAfter == cStackBefore);
#endif

#if defined(DO_SPU_PROFILING)
			//the decrementer counts downwards, so start - end yields the elapsed ticks
			const uint32 cJobExecEndTime = spu_readch(SPU_RdDec);
			jobTicks += cJobExecStartTime - cJobExecEndTime;
#endif
			//first page must still be present at its position
			firstLoopIt = false;
			assert(IsPagePresent(spu_splats((int)cFirstPageIndex)));
//------------------------------------------inner packet loop end------------------------------------------------

			//flush cache	so that callbacks see all updated data reside on main memory as well
			//this is time critical, flushing is shortly faster than the callback wakeup on PPU
			//currently it is syncd to avoid any time critical issues
			//g_SPUNumSets == 0 means the cache is bypassed, nothing to flush then
			IF(g_SPUNumSets > 0, 1)
			{
				IF(!keepCache, 0)
				{
	#if defined(SPU_CACHE_MISS_USE_ASM)
					FlushCacheComplete(syncOnFlushCache);
	#else
					NSPU::CSPUMemMan::FlushCacheComplete(syncOnFlushCache, true);
	#endif//SPU_CACHE_MISS_USE_ASM
				}
				else
				{
					//finish atomic transfers and invalidate prefetch buffer
					NCache::SyncAtomicDCache();
					NCache::g_PrefetchLRUDir	= NCache::g_PrefetchDir = spu_splats((uint32)0);
				}
			}
			uint32 jobFinished = 0;
			if(cHasQueue)
			{
				//tricky part of queue synchronization
				//update loop, loop until we either successfully wrote the updated pointers/state back or the push pointer has changed on PPU side
				mfc_prep(pQueueBuffer, queueAddress);
				int newJobFound = 0;//if a new job is found update pull pointer on PPU too
				uint32 jobStateToBeSet = NPPU::scJobFinished;
				do 
				{
					//get lock for queue
					mfc_getllar_again();
					mfc_read_atomic_status();
					//store pull pointer (not the incremented one if a new job is fetched, this way we
					//	can detect if the push pointer was really changed
					*pQueuePull			= curPullAddr;
					//on a retry of this loop newJobFound is already set, so at most one packet is fetched here
					if(newJobFound == 0)
					{
						//check if push has been changed in the meantime
						curPushAddr = (curPullAddr != curPushAddr)?curPushAddr : *pQueuePush;//update if we have fetched all preceding packets
						if(curPullAddr != curPushAddr)//push pointer has been changed, init all for next parameter packet iteration
						{
							//transfer next parameter packet here even there are no more packets
							MemcpyLS(pParamArea, curPullAddr, cParamSize + sizeof(NPPU::SAddPacketData), g_scDMAPPUMemTag);
							//pull pointer is only changed on SPU, so no need to fetch it again
							IncrQueuePullPointer(curPullAddr, queueIncr, queueStart, queueEnd);
							++addJobInvoc;
							newJobFound = 1;
							jobStateToBeSet = NPPU::scJobRunning;
						}
					}
					//job state is scJobRunning if another packet has been found, scJobFinished otherwise
					*pQueueJobState = jobStateToBeSet;
					mfc_prep(pQueueBuffer, queueAddress);
					mfc_putllc_again();
				}while(mfc_read_atomic_status() != 0);//repeat as long as the conditional write back reports a non zero status

				if(newJobFound)
				{
					//wait for the parameter packet fetched inside the update loop
					SyncMemory(g_scDMAPPUMemTag);
					//apply new stack size
					IF(!keepCache, 1)
					{
						//stack size and cache mode for the next packet are stored in the SAddPacketData behind the parameters
						NPPU::SAddPacketData* const __restrict cpAddPacketdata = (NPPU::SAddPacketData*)((uint8*)pParamArea + cParamSize);
						g_sInfoBlock.SetMinStackSizeKB(cpAddPacketdata->stackSizeKB);
						g_sInfoBlock.SetCacheMode((NPPU::ECacheMode)cpAddPacketdata->cacheMode);
						goto StartParameterLoop;//restart loop
					}
					goto StartParameterLoopKeepCache;
				}
				//problem: if cache gets flushed after other job starts, synchronization issues might occur
				//no further packet: a kept cache has not been flushed after the packet, so flush it now
				IF(g_SPUNumSets > 0, 1)
				{
					IF(keepCache, 0)
	#if defined(SPU_CACHE_MISS_USE_ASM)
						FlushCacheComplete(syncOnFlushCache);
	#else
						NSPU::CSPUMemMan::FlushCacheComplete(syncOnFlushCache, true);
	#endif//SPU_CACHE_MISS_USE_ASM
				}
				jobFinished = 1;
#if defined(DO_SPU_PROFILING)
				//transfer the performance stats back if requested
				if(g_sInfoBlock.eaJobPerfAddress != 0)
				{
					const uint32 cJobEndTime		= spu_readch(SPU_RdDec);
					//decrementer reading takes at least 20 cycles, 40 cycles per tick currently
					g_PerfStats.spuJobTime			= jobTicks - 1;
					g_PerfStats.spuSetupTime		= (uint16)((cJobStartTime - cJobEndTime - 1) - jobTicks);
					//cache stats written by SPUMemManager, transfer back
					MemcpyMain(g_sInfoBlock.eaJobPerfAddress, (TAddrLS)&g_PerfStats, sizeof(SJobPerfStats), g_scDMAOutputTag);
				}
#endif
			}
			else
			{
				HandleJobSpawn();//must not be called from packet jobs if cache is kept, also not called from queued jobs
	#if defined(DO_SPU_PROFILING)
				//transfer the performance stats back if requested
				if(g_sInfoBlock.eaJobPerfAddress != 0)
				{
					const uint32 cJobEndTime		= spu_readch(SPU_RdDec);
					//decrementer reading takes at least 20 cycles, 40 cycles per tick currently
					g_PerfStats.spuJobTime			= jobTicks - 1;
					g_PerfStats.spuSetupTime		= (uint16)((cJobStartTime - cJobEndTime - 1) - jobTicks);
					//cache stats written by SPUMemManager, transfer back
					MemcpyMain(g_sInfoBlock.eaJobPerfAddress, (TAddrLS)&g_PerfStats, sizeof(SJobPerfStats), g_scDMAOutputTag);
				}
	#endif
				//jobFinished is passed to HandleCallback, its result decides whether the job slot is released below
				disableJobAtEnd = HandleCallback(jobFinished);
			}

			//transfer bucket headers back, transfer any garbage in front of it (16 bytes) to copy 128 bytes and get peak performance
			MemcpyMain
			(
				NSPU::NDriver::g_DestMemAreaEA,
				(void*)NSPU::NDriver::g_sMemMan.GetBucketSPUAddr(),
				128,
				g_scDMAOutputTag
			);

			//write via DMA a 0 to the job state address (the one tracking job slots, not the one informing waiters)
			if(disableJobAtEnd)
				MemcpyMain(g_sInfoPacketAddr, (TAddrLS)&g_sZero, 4, g_scDMAOutputTag);//located on the beginning of the info packet
#if defined(SUPP_SPU_FRAME_STATS)
			//ticks elapsed since the decrementer was reset to 0xFFFFFFFF
			const uint32 cCurDecrCnt		= spu_readch(SPU_RdDec);
			const uint32 fullTickCount = (0xFFFFFFFFU - cCurDecrCnt);
			__spu_transfer_frame_stats(fullTickCount);
#endif
			//the following block is only conditional if frame stats or function profiling are compiled in
#if defined(SUPP_SPU_FRAME_STATS) || defined(DO_SPU_FUNCPROFILING)
			IF((NSPU::g_PageInfo.profilingEnabled != 0) && (jobFinished == 1), false)
#endif
			{
#if defined(SUPP_SPU_FRAME_STATS)
				TransferSPUJobFrameStats(fullTickCount, addJobInvoc);//update per job stats
#endif
#if defined(DO_SPU_FUNCPROFILING)
				TransferFuncProfStats();
#endif
			}
			ReleaseDebugging();
		}
	}//NDriver
}//NSPU

//do not rename, main is specified entry point by makefile
//main is relocated and cPacketInfo is set up by initial spu loader, up to 4 params are possible
//cPacketInfo[0] pull address for jobs (128 byte aligned, the lower 7 bits carry the number of SPUs)
//cPacketInfo[1] destination area of memory with PPU communication
//cPacketInfo[2] SPageDirInfo address
//cPacketInfo[3] driver size (new memory is transferred straight behind it) (lower 24 bit, upper 8 bit SPU id)
//initializes the driver globals, fetches the page directory into LS and then loops forever:
//	in polling state it fetches jobs from the job queue and processes them via ProcessJob, 
//	in wait state (only with ENABLE_INTER_SPU_EVENTS) it blocks on signal notification channel 2
//never returns
__attribute__((noreturn))
int main(const NSPU::NDriver::SSpuParam cPacketInfo)
{
	NSPU::NDriver::g_sPageEA = cPacketInfo.data[2];
	NSPU::NDriver::ResetJobStorage();
	NSPU::NDriver::InitCache();
	//lower 24 bits: first free LS address behind the driver
	uint32 availMemStart = cPacketInfo.data[3] & (255 | (255 << 8) | (255 << 16))/*driver size*/;
	//cPacketInfo tells where PPU pushes to and we have to pull from the individual job queue
	NSPU::NDriver::g_sPullEA	= (uint32)cPacketInfo.data[0] & ~127;//pull is 128 byte aligned
	const uint32 cMaxSPUs			= (uint32)cPacketInfo.data[0] & 127;//packed into the lower 7 bits
	//packet sync is right before pull
	NSPU::NDriver::g_sSPUPacketSyncEA = NSPU::NDriver::g_sPullEA + NPPU::scJobInfoBlocksSPUPacketSyncAddressDiff; 
	NSPU::NDriver::ResetLastProcessData();

	//g_sLockActive: slot 0 is checked in the polling loop below before reacting on signals, slot 1 holds the idle loop count
	NSPU::NDriver::g_sLockActive = spu_splats((uint32)0);

	//upper 8 bits: ID of this SPU
	const uint32 cSPUID = (cPacketInfo.data[3] & (255 << 24)) >> 24;
	NSPU::NDriver::g_SPUId = cSPUID;

	//first SPU performs fast polling, others are activated on demand and poll with a lower rate
	const uint32 cIdleLoops = (cSPUID == 0)?FAST_POLLING_IDLE_LOOP_CNT : SLOW_POLLING_IDLE_LOOP_CNT;
	//init idle loops
	NSPU::NDriver::g_sLockActive = spu_insert(cIdleLoops, NSPU::NDriver::g_sLockActive, 1);

//	#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
		NSPU::g_sProgramTopLS = 0;
//	#endif

#if defined(SUPP_SN)
		{
			//write poll state into dedicated memory area to make the PPU know which SPU is actually doing work
			//the driver module header is located at LS address 0
			spu_mod_hdr *const __restrict pDriverModHeader = (spu_mod_hdr*)0;
			pDriverModHeader->pad = SPUPollState;
		}
#endif
	//g_sEventStatNextSPU: slot 0 = own event state (polling/wait), slot 1 = state of the next SPU, slot 3 = ID of the next SPU
#if defined(ENABLE_INTER_SPU_EVENTS)
	//only first SPU is allowed to poll initially
	NSPU::NDriver::g_sEventStatNextSPU = spu_insert
	(
		(cSPUID != 0)? STATE_EVENT_WAIT : STATE_EVENT_POLLING,
		NSPU::NDriver::g_sEventStatNextSPU,
		0
	);
#endif
	//initialize the signal status 
	//the next SPU is the one with the following ID, wrapping around to 0
	uint32 nextSPUId = cSPUID + 1;		nextSPUId = (nextSPUId >= cMaxSPUs)?0 : nextSPUId;
	//in case we got only one 1 SPU, do not send signals at all
#if defined(ENABLE_INTER_SPU_EVENTS)
	//SPU 0 polls initially, so the SPU preceding it treats its next SPU as already activated
	NSPU::NDriver::g_sEventStatNextSPU = 
		spu_insert((cMaxSPUs <= 1)?STATE_NEXT_SPU_NOT_AVAIL : 
		(nextSPUId == 0)?STATE_NEXT_SPU_ACTIVATED : STATE_NEXT_SPU_DEACTIVATED, NSPU::NDriver::g_sEventStatNextSPU, 1);
	//initialize next SPU ID
	NSPU::NDriver::g_sEventStatNextSPU = spu_insert(nextSPUId, NSPU::NDriver::g_sEventStatNextSPU, 3);
#endif
	//the per SPU communication areas below are all located relative to cPacketInfo.data[1]
#if defined(SUPP_PRINTF)
	NSPU::NDriver::g_DestPrintfAreaEA = cPacketInfo.data[1] + NPPU::scPrintfBufDiff + cSPUID * SPU_PRINTF_BUF_SIZE;
#endif

	NSPU::NDriver::g_sJobQueuePullBack.lockObtained = 0;//preset to write back

#if defined(DO_SPU_PROFILING)
	NSPU::NDriver::g_DestProfAreaEA		= cPacketInfo.data[1] + NPPU::scProfBufDiff + cSPUID * MAX_PROF_ID * (4*3);
#endif
	NSPU::NDriver::g_DestCustomCallbackAreaEA	= cPacketInfo.data[1] + NPPU::scCallbackBufDiff + cSPUID * sizeof(NPPU::SDMACallbackData);

#if defined(SUPP_SPU_FRAME_STATS)
	NSPU::NDriver::g_DestStatsAreaEA	= cPacketInfo.data[1] + NPPU::scStatsDiff;
#endif

	NSPU::NDriver::g_DestMemAreaEA = cPacketInfo.data[1] + cSPUID * SIZEOF_SPPUMEMREQUESTDATA;
	
	NSPU::NDriver::g_sMemMan.Init(NSPU::NDriver::g_DestMemAreaEA);	//init memory management

	//init page directory
	//the page info is required right away (page count and page directory address), so sync immediately
	MemcpyLS(&NSPU::g_PageInfo, NSPU::NDriver::g_sPageEA, sizeof(NSPU::SPageDirInfo), g_scDMAPageTag0);	
	SyncMemory(g_scDMAPageTag0);

#if defined(SUPP_SPU_FRAME_STATS)
	{
		const uint32 cFrameProfileDataBaseEA	= NSPU::g_PageInfo.ppuSyncEA + sizeof(NPPU::SQueueNodeSPU) + 128 + 16;
		MemcpyLS(&NSPU::NDriver::g_FrameProfileDataBase, cFrameProfileDataBaseEA, 4, g_scDMAPageTag1);	//read CJobManSPU::m_pFrameProfileData, synced later
	}
#endif

#ifdef DO_SPU_FUNCPROFILING
	NSPU::NDriver::g_FuncProfData.funcProfTimingAreaEA = NSPU::g_PageInfo.funcProfTimingEA;
#endif
	assert((availMemStart & 0xF) == 0);//expect 16 byte alignment
	//the global page directory is placed in LS directly behind the driver
	NSPU::g_GlobalSPUPageDir = (NSPU::SPageInfo*)availMemStart;
	const uint32 cPageDirSize = NSPU::AlignSize16(NSPU::g_PageInfo.pageNum * sizeof(NSPU::SPageInfo));
	availMemStart += cPageDirSize;
	MemcpyLS(NSPU::g_GlobalSPUPageDir, NSPU::g_PageInfo.pageDirEA, cPageDirSize, g_scDMAPageTag1);	
//	assert(scMaxSPUPageCount == 4);
	//reset the page slot states while the page directory transfer is in flight
	NSPU::g_SPUPageStates[0].Reset();
	NSPU::g_SPUPageStates[1].Reset();
	NSPU::g_SPUPageStates[2].Reset();
	NSPU::g_SPUPageStates[3].Reset();
	ResetPageDir();
	SyncMemory(g_scDMAPageTag1);

	//jobs expect this kind of alignment
	availMemStart = NSPU::AlignSize128(availMemStart);
	//keep grabbing jobs from the job queue
	while (1)
	{
/*
typedef union { char c4[4]; uint16_t u16[2]; uint32_t u32; } Module_u;
const Module_u s_mu = { { 'D', 'R', 'I', 'V' } };
__spu_insert_bookmark( 0xffaa );	// start marker 1
__spu_insert_bookmark( s_mu.u16[0] );	// name
__spu_insert_bookmark( s_mu.u16[1] );	// name
__spu_insert_bookmark( 0 );	// level
__spu_insert_bookmark( 0 );		// LSA
__spu_insert_bookmark( 0xffab );	// start marker 2
__spu_insert_bookmark( 0x3E0 );
typedef union { uint16_t u16[4]; uint32_t u32[2]; uint64_t u64; } GUID_u;
GUID_u guid;
qword cPageGUID = (qword)si_lqd((qword)spu_promote(0,0), 128);
qword insn = si_roti(cPageGUID, 7);
qword pattern = (qword)(vec_uchar16){0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13};
guid.u64 = si_to_ullong(si_shufb(insn, insn, pattern));
__spu_insert_bookmark( 0xffac );	// start marker 1
__spu_insert_bookmark( guid.u16[0] );	// guid
__spu_insert_bookmark( guid.u16[1] );	// guid
__spu_insert_bookmark( guid.u16[2] );	// guid
__spu_insert_bookmark( guid.u16[3] );	// guid
__spu_insert_bookmark( 0xffad );	// start marker 2
__spu_insert_bookmark( 0x7C00 );
*/
		//without inter SPU events every SPU is always polling and the block below is executed unconditionally
#if defined(ENABLE_INTER_SPU_EVENTS)
		if(spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 0) == STATE_EVENT_POLLING)
#endif
		{
			//poll until a job has been fetched
			while(!NSPU::NDriver::GetJobFromJobQueue(cSPUID))
			{
				//check if we are told to change state into wait
				//	only done if slot 0 of g_sLockActive is not set, a pending signal on channel 2 is consumed in the wait branch
#if defined(ENABLE_INTER_SPU_EVENTS)
				if(!spu_extract(NSPU::NDriver::g_sLockActive, 0))
					IF(spu_readchcnt(SPU_RdSigNotify2), false)
						goto SkipProcessingJob;
#endif
				//do some idle loops to relieve the bus a bit
				const uint32 cCurIdleLoops = spu_extract(NSPU::NDriver::g_sLockActive, 1);
				for(uint32 i=0; i<cCurIdleLoops; ++i)
				{
					asm volatile("nop");
					asm volatile("nop");
				}
			}
				//we got a job
#if defined(DO_SPU_PROFILING) 
				NSPU::NDriver::g_JobFetchTime = 0xFFFFFFFFU - spu_readch(SPU_RdDec);//time from job signal to here
#endif
#if defined(DO_SPU_PROFILING) || defined(SUPP_SPU_FRAME_STATS)
				spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
#endif
#if defined(SUPP_SN)
				{
					//write processing state into dedicated memory area to make the PPU know which SPU is actually doing work
					spu_mod_hdr *const __restrict pDriverModHeader = (spu_mod_hdr*)0;
					pDriverModHeader->pad = SPURunState;
				}
#endif
				NSPU::NDriver::ProcessJob(availMemStart, cSPUID, cMaxSPUs);
#if defined(SUPP_SN)
				{
					//write poll state into dedicated memory area to make the PPU know which SPU is actually doing work
					spu_mod_hdr *const __restrict pDriverModHeader = (spu_mod_hdr*)0;
					pDriverModHeader->pad = SPUPollState;
				}
#endif
#if defined(ENABLE_INTER_SPU_EVENTS)
			//we have finished processing the job, notify next spu that we are back idle
			const bool cStateNextSPUActive = (cMaxSPUs > 0) && ((spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 1) == STATE_NEXT_SPU_ACTIVATED));
			if(cStateNextSPUActive)
			{
				//send STATE_WAIT signal to next SPU
				NSPU::NDriver::SendSignal2(STATE_EVENT_WAIT, spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 3));
				NSPU::NDriver::g_sEventStatNextSPU = spu_insert(STATE_NEXT_SPU_DEACTIVATED, NSPU::NDriver::g_sEventStatNextSPU, 1);//change state
			}
#endif
			//reset idle loops to fast polling
			NSPU::NDriver::g_sLockActive = spu_insert(FAST_POLLING_IDLE_LOOP_CNT, NSPU::NDriver::g_sLockActive, 1);
#if !defined(ENABLE_INTER_SPU_EVENTS)
			//write poll state into dedicated memory area to make the PPU know which SPU is actually doing work
			//NOTE(review): unlike the other poll state writes this one is not guarded by SUPP_SN - confirm this is intended
			spu_mod_hdr *const __restrict pDriverModHeader = (spu_mod_hdr*)0;
			pDriverModHeader->pad = SPUPollState;
#endif
		}
#if defined(ENABLE_INTER_SPU_EVENTS)
		else
		{
//also entered via goto from the polling loop above if a signal is pending
SkipProcessingJob:
			//wait for signal event from previous SPU (can only happen for SPU ID 1..4)
#if defined(SUPP_SN)
			//write wait state into dedicated memory area to make the PPU know which SPU is actually doing work
			spu_mod_hdr *const __restrict pDriverModHeader = (spu_mod_hdr*)0;
			pDriverModHeader->pad = SPUWaitState;
#endif
			//channel read of the signal notification register, the value read becomes the new own event state
			const uint32 cSignalRes = (uint32)spu_readch(SPU_RdSigNotify2);
#if defined(SUPP_SN)
			//write poll state into dedicated memory area to make the PPU know which SPU is actually doing work
			pDriverModHeader->pad = SPUPollState;
#endif
			NSPU::NDriver::g_sEventStatNextSPU = spu_insert(cSignalRes, NSPU::NDriver::g_sEventStatNextSPU,	0);
		}
#endif
	}
}

#endif //__SPU__
#endif //PS3
