/* 
	implementation of spu driver and its functions
	mechanism: - main is a never exiting loop, it grabs jobs by polling and locking the jobs from the PPU job queue (1 shared by all SPUs)
			  - each job has a certain order of data to be transferred to LS
				- can have dependent jobs to call next, so a certain order is guaranteed without having the need to join or wait on some sync mechanism
				- each job can have multiple input/output packets (memory locations must not depend on the previous jobs)
				- for all kind of callbacks, check job state variable first, this is sync't against output DMA
				- stack when entered main is 752 bytes below the top address -> temp storage
				- it copies the job info block to local store, processes it by dma in the input memory mapping, job code and data 
				- it starts the job by calling the Execute function (always the entry point) by passing the pointer to the parameter area 
	polling behavior:
				- SPU ID0 initially polls with a high rate, the others wait for an event
				- once the pull pointer has been locked, the polling rate is set to high and no further locks are issued
				- if a SPU sees a pull pointer that is locked by another SPU, it resets to a slow polling rate
				- once a job has been fetched, the next SPU (SPU with ID 0 -> SPU with ID 1 and so on) is signaled to start polling
				- once a SPU has finished processing a job chain, the next SPU is signaled to enter wait state
				  (because it changes to polling state itself), the polling frequency is reset to high

	loops:- each job can have >= 1 packets which are basically consecutive function calls
					- cache is flushed after all have been finished
				- if a job runs via a producer / consumer queue, each packet flushes the cache
					- it cannot have the inner loop packets as above in this mode

	memory layout:

	The LS memory map is organized as follows:
	[TOP Address = 0]
		SPUDriver including Cache
		Bubble directory
		per Job:
			Info packet (SInfoBlock)
			DMA list data (SDMAElement[])
			job function code and data
			parameter area (input into PFNEXECUTE)		
			memory transferred via DMA list
		Stack (at 752 below top)
		384 bytes Initial Loader / 4 * 128 byte static areas for copying stuff here
	[BOTTOM Address = scLSSize]
	
*/ 

#if defined(PS3)
#if defined(__SPU__)

#define eCryModule eCryM_Launcher
#include <CryModuleDefs.h>
#include <platform.h>
#include <stdio.h>
#include <spu_intrinsics.h>
#include <sys/spu_thread.h>
#include "SPU.h"
#include <IJobManSPU.h>
#include "../PPU/PPU.h"
#include "Memory.h"
#include "SPUMemManager_spu.h"
//include header files for dependency generation (to force recompilation of all jobs)
#include <SPUJobBase.h>
#include <SPUJob.h>
#include "SPUUtilities.h"
#include "./Cache/CacheDefs_spu.h"
#include "CodePage/SPUBubbles.h"
#include "CodePage/BubbleLayout.h"
#include "../PPU/ProdConsQueue.h"

#if defined(SPU_CACHE_MISS_USE_ASM)
	extern "C" void FlushCacheComplete(const bool cDoSync);
#endif

#if defined(SUPP_SN)
	#include <libsn_spu.h>
#endif

namespace NSPU
{
	//cache state that is defined in another translation unit and only referenced by the driver
	namespace NCache
	{
		extern vec_uint4* __restrict g_pSPUShadowCache;	//set by SetupCache to the memory right behind the cache
		extern vec_uint4 g_AsyncRangesDirFrom;			//reset to all 0xFFFFFFFF by ResetCacheControl
		extern vec_uint4 g_AsyncRangesDirTo;			//reset to 0 by ResetCacheControl
		extern uint32 g_CurSPUAsyncTag _ALIGN(16);		//reset by ResetCacheControl if SUPP_ASYNC is defined
		extern vec_uint4 g_SPUAsyncDir;					//reset by ResetCacheControl if SUPP_ASYNC is defined
		extern vec_uint4 g_PrefetchLRUDir;				//reset by ResetCacheControl if PREFETCH is defined
		extern vec_uint4 g_PrefetchDir;					//reset by ResetCacheControl if PREFETCH is defined
		extern uint32 g_SPUNumSets _ALIGN(16);			//number of cache sets, computed by SetupCache
		extern uint32 g_CurAtomicEA _ALIGN(16);			//cleared by InitCache
#if defined(ENABLE_HAZARD_MODE)
		extern uint32 g_AtomicEAToStart _ALIGN(16);		//cleared by InitCache
#endif
#if defined(DO_SPU_PROFILING)
		//profiling counters, initialized by InitCache
		extern vec_uint4 g_SPUCacheHitIncr;
		extern vec_uint4 g_SPUCacheProfIDCounter[];
		extern vec_uint4 g_SPUCacheCurProfID;
#endif
	}
	using NCache::g_SPUNumSets;

	//code paging entry point, defined in another translation unit
	//NOTE(review): meaning of the four int parameters is not visible here - confirm against the definition
	namespace NCodePaging
	{
		extern const bool SetActiveBubbles(const int, const int, const int, const int);
	}

	//bubble (code page) state, defined in another translation unit
	extern SBubbleInfo *g_GlobalSPUBubbleDir _ALIGN(16);	//per bubble index: ea and size of the bubble (read by SetupBubbles/PrefetchBubbles)
	extern SBubbleState g_SPUBubbleStates[scMaxSPUBubbleCount] _ALIGN(16);	//per slot: curIndex, curState, transDecrEnd
	extern vec_int4 g_SPUBubbleDir;					//set to all -1 by SetupBubbles if the bubble mode changes
	extern vec_uint4 g_SPUBubbleLRUDir;			
	extern vec_uint4 g_BubbleLRUCounter;		
	extern uint8* __restrict g_SPUBubbleMem;		//LS base address of the bubble memory (slot 0)
	extern uint32 g_SPUBubbleSize _ALIGN(16);		//size of one bubble slot, set by SetupBubbles
	extern vec_uint4 g_BubbleMemLower;				//word i: LS start address of slot i, set by SetupBubbles
	extern vec_uint4 g_BubbleMemUpper;				//g_BubbleMemLower + g_SPUBubbleSize, set by SetupBubbles
	extern SReturnStackEntry g_ReturnStack[RETURN_STACK_MAX_ENTRIES] _ALIGN(16);
	extern SReturnStackEntry *g_pReturnStackTop;	//reset to &g_ReturnStack[0] by SetupBubbles
	extern vec_uint4 g_CurBubIndex;					//reset to 0 by SetupBubbles
	extern vec_uint4 g_SPUBubbleIndexMask0;			//constant (0,1,2,3), set by InitCache
	extern vec_uint4 g_SPUBubbleIndexMaskShl4;		//constant (0,16,32,48), set by InitCache
	extern SBubbleDirInfo g_BubInfo _ALIGN(16);		//provides ppuSyncEA used by HandleJobSpawn


#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
	//only exists for assert and profiling builds, SetupCache adds the total cache size to it
	uint32 g_sProgramTopLS _ALIGN(16);//program top address
#endif 

	namespace NDriver 
	{
		//one cache line local buffer, also used by cache miss handler and callback sync
		//in this file it is the target of the atomic lock line transfers in Lock and HandleCallback
		uint8 g_sLSBuffer[128] _ALIGN(128) _DATASEC;
		//static variables to save stack and register passing of arguments
		//info block of the job fetched last (filled by GetJobFromJobQueue)
		SInfoBlock g_sInfoBlock _ALIGN(128) _DATASEC; 
		CSPUMemMan g_sMemMan _ALIGN(128) _DATASEC;

		//queue position which GetJobFromJobQueue writes back to g_sPullEA once a fetchable job has been found
		static SJobQueuePos g_sJobQueuePullBack;
		//EA of the info block which has been fetched last (set by GetJobFromJobQueue)
		static uint32 g_sInfoPacketAddr _ALIGN(16);
		//EA of the pull position of the PPU job queue (the push position is located scJobInfoPushPullAddressDiff below)
		static uint32 g_sPullEA _ALIGN(16);
		//local copy of the current pull pointer, valid once Lock has succeeded
		static uint32 g_sCurPullPtr _ALIGN(16);
		//EA of the cache line holding the packet sync lock and counters (see HandleCallback)
		static uint32 g_sSPUPacketSyncEA _ALIGN(16);
		//NOTE(review): not referenced in the visible part of this file - presumably the EA of the job info blocks, confirm
		static uint32 g_sJobInfoBlockEA _ALIGN(16);

#if defined(SUPP_SN)
		//non zero if debugging is enabled for the current job (set by ProcessJob, read by ReleaseDebugging)
		uint32 g_sDebugState _ALIGN(16);
#endif
		//tells the PPU that debugging of the current job is over
		//only does something if SUPP_SN is defined and g_sDebugState is set: 
		//	the debug callback port is then written to the interrupt mailbox
		__attribute__((always_inline))
		inline void ReleaseDebugging()
		{
#if defined(SUPP_SN)
			if(g_sDebugState != 0)
			{
				spu_writech(SPU_WrOutIntrMbox, NPPU::scDebugCallbackPort | (EVENT_CALLBACK_PORT << EVENT_PORT_SHIFT));
			}
#endif
		}

		//state kept across jobs to skip redundant setup work
		//word 0: last program EA
		//word 1: last bubble mode
		static vec_uint4 g_sLastProcessData;
		//word 0: 1 if SPU has obtained the pull locked
		//word 1: current idle count
		static vec_uint4 g_sLockActive _ALIGN(16);
		#define FAST_POLLING_IDLE_LOOP_CNT 0			//idle loop count for SPU 0 and if lock is to be obtained for the 1st time
		#define SLOW_POLLING_IDLE_LOOP_CNT 8000		//idle loop count for SPU 1..4 and if lock has been obtained by another SPU

		//EA from which ProcessJob transfers the 128 byte bucket headers of the memory manager
		uint32 g_DestMemAreaEA _ALIGN(16);
		//word JOB_SPAWN_PUSH_WORD: push pointer if to be set at parents exit (same AS curAddr in PPU::SJobQueuePos)
		//word JOB_SPAWN_STATE_WORD: turns 1 if a job has been spawned (callback or external job state must not be called)
		vec_uint4 g_JobSpawnRegister;	
		//source of the zero which is DMA'd to the external job state, cleared by InitCache
		static vec_uint4 g_sZero;		//need some 0 to transfer
#if defined(DO_SPU_PROFILING)
		//per job performance statistics, reset at the start of ProcessJob
		SJobPerfStats g_PerfStats _ALIGN(16);
		uint32 g_DestProfAreaEA;
		uint32 g_JobFetchTime;
#endif
#if defined(SUPP_SPU_FRAME_STATS)
		uint32 g_DestStatsAreaEA;
		uint32 g_FrameProfileDataBase;
#endif
#if defined(SUPP_PRINTF)
		uint32 g_DestPrintfAreaEA _ALIGN(16);
#endif
		//to avoid polling on the bus by all SPUs, signals are sent between them
		//only IDs 0..3 are sending, ID 4 does not do anything
		//word 0: status if to poll for a job or not
		#define STATE_EVENT_POLLING 0
		#define STATE_EVENT_WAIT 1
		//word 1: event status of next SPU
		#define STATE_NEXT_SPU_ACTIVATED 0
		#define STATE_NEXT_SPU_DEACTIVATED 1
		#define STATE_NEXT_SPU_NOT_AVAIL 2
		//word 2: available
		//word 3: ID of SPU to send a signal to
		static vec_uint4 g_sEventStatNextSPU _ALIGN(16);
		//SendSignal2 stores the signal value in element 3 and DMAs it from there
		static uint32 g_SignalBuffer[4] _ALIGN(16);//signal buffer for mmio

		//enable it if the lock line lost reservation event is used to wait for an update on the push pointer 
		//if it is not defined, GetJobFromJobQueue copies the push pointer once and returns false if no job is available
		#define LOCK_USES_LLAR_EVENT

		//returns word 0 of g_sLastProcessData: the EA of the job program which has been transferred last
		__attribute__((always_inline))
		inline const uint32 GetLastProgramEA()
		{
			const uint32 cLastProgramEA = spu_extract(g_sLastProcessData, 0);
			return cLastProgramEA;
		}

		__attribute__((always_inline))
		inline void SetLastProgramEA(const uint32 cEA)
		{
			g_sLastProcessData = spu_insert(cEA, g_sLastProcessData, 0);
		}

		__attribute__((always_inline))
		inline void ResetLastProcessData()
		{
			g_sLastProcessData = (vec_uint4)0;
			g_sLastProcessData = spu_insert(0xFFFFFFFF, g_sLastProcessData, 1);
		}

		//returns word 1 of g_sLastProcessData interpreted as bubble mode
		__attribute__((always_inline))
		inline const NPPU::EBubbleMode GetLastBubbleMode()
		{
			const uint32 cModeWord = spu_extract(g_sLastProcessData, 1);
			return (NPPU::EBubbleMode)cModeWord;
		}

		//stores cMode into word 1 of g_sLastProcessData, the other words are kept
		__attribute__((always_inline))
		inline void SetLastBubbleMode(const NPPU::EBubbleMode cMode)
		{
			const uint32 cModeWord = (uint32)cMode;
			g_sLastProcessData = spu_insert(cModeWord, g_sLastProcessData, 1);
		}

		__attribute__((always_inline))
		inline void ResetJobSpawnReg()
		{
			g_JobSpawnRegister = spu_splats((uint32)0);
		}

		__attribute__((always_inline))
		inline const bool IsJobSpawned()
		{
			return spu_extract(g_JobSpawnRegister, JOB_SPAWN_STATE_WORD) != 0;
		}

		//returns the push word of the job spawn register
		__attribute__((always_inline))
		inline const uint32 RetrieveJobSpawnPush()
		{
			const uint32 cSpawnPush = spu_extract(g_JobSpawnRegister, JOB_SPAWN_PUSH_WORD);
			return cSpawnPush;
		}

		__attribute__((always_inline))
		inline void ResetJobSpawnPush()
		{
			g_JobSpawnRegister = spu_insert((uint32)0, g_JobSpawnRegister, JOB_SPAWN_PUSH_WORD);
		}

		//returns the LS address of the push word inside the job spawn register (used as DMA source)
		__attribute__((always_inline))
		inline const uint32 GetJobSpawnPushLS()
		{
			const uint32 cRegisterLS = (uint32)&g_JobSpawnRegister;
			return cRegisterLS + JOB_SPAWN_PUSH_WORD * sizeof(int);
		}

		//returns the LS address of the state word inside the job spawn register (used as DMA source)
		__attribute__((always_inline))
		inline const uint32 GetJobSpawnStateLS()
		{
			const uint32 cRegisterLS = (uint32)&g_JobSpawnRegister;
			return cRegisterLS + JOB_SPAWN_STATE_WORD * sizeof(int);
		}

		__attribute__((always_inline))
		inline void HandleJobSpawn()
		{
			const uint32 cPushEA = RetrieveJobSpawnPush();
			IF(IsJobSpawned() && cPushEA != 0, false)
			{
				//unlock PPU job queue and transfer new push address back
				const uint32 cPushEA			= g_BubInfo.ppuSyncEA + offsetof(SJobQueuePos, curAddr) - offsetof(SJobQueuePos, lockObtained);
				const uint32 cSpinLockEA	= g_BubInfo.ppuSyncEA + sizeof(NPPU::SQueueNodeSPU);
				assert((cPushEA & 0xF) == (JOB_SPAWN_PUSH_WORD * sizeof(int)));
				//important set push pointer last (so that all data are valid once the other SPUs fetch them)
				assert((GetJobSpawnPushLS() & 0xF) == (cPushEA & 0xF));
				MemcpyMainFenced(cPushEA, (volatile TAddrLS)GetJobSpawnPushLS(), sizeof(int), 0);
				//now unlock
				assert((GetJobSpawnStateLS() & 0xF) == (cSpinLockEA & 0xF));
				MemcpyMainFenced(cSpinLockEA, (volatile TAddrLS)GetJobSpawnStateLS(), 4, 0);
				//sync in place since next packet could spawn new job
				spu_writech(MFC_WrTagMask, 1<<0);
				spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
				spu_readch(MFC_RdTagStat);
				//reset push address
				ResetJobSpawnPush();
			}
		}

		//locks a job entry position from SPU in main memory using mfc atomics
		//inlined since it is just called once
		//the cache line at g_sPullEA is transferred into g_sLSBuffer and interpreted as SJobQueuePos
		//cSPUID: ID of this SPU, cSPUID+1 is written into lockObtained
		//returns false without retrying if lockObtained is already set (lock is held by somebody else)
		//returns true once the conditional write back of the lock has succeeded, 
		//	g_sCurPullPtr then holds curAddr of the queue position and word 0 of g_sLockActive is 1
		__attribute__((always_inline))
		inline const bool Lock(const uint32 cSPUID)
		{
			//if we have not already obtained the lock, just copy current contents here
			int status;
			uint32 event;//not used within this function
			volatile SJobQueuePos *pLS = (volatile SJobQueuePos*)g_sLSBuffer;
			mfc_getllar_prep(pLS, g_sPullEA);
			do
			{
				//status stays non zero til the conditional put has been issued and reported success
				status = 1;
				mfc_getllar_again();
				mfc_read_atomic_status();
				IF(pLS->lockObtained == 0, true)
				{
					//write SPU, this way PPU knows which SPU currently has the lock (for profiling/debugging)
					pLS->lockObtained = cSPUID+1;
					mfc_putllc_again();
					status = mfc_read_atomic_status();
				}
				else
				{
					//another SPU has locked the queue, exit and do some idle loops
					return false;
				}
			}
			while(status != 0);//repeat the whole sequence if the conditional put did not return 0
			//update current pull pointer on PPU (only this SPU can now change it)
			g_sCurPullPtr = pLS->curAddr;
			g_sLockActive = spu_insert((uint32)1, g_sLockActive, 0);//this SPU has obtained the lock
			return true;
		}

		//gets a job from the job queue
		//returns true if a job has been pulled from, false if no job is currently available
		//cSPUID: ID of this SPU, only passed on to Lock
		//works as follows:
		//	- obtain the pull lock if this SPU does not hold it yet (word 0 of g_sLockActive), 
		//		return false and switch to the slow idle count if another SPU holds it
		//	- wait for (LOCK_USES_LLAR_EVENT) or check once for a push pointer which differs from the current pull pointer
		//	- transfer the info block at the pull pointer into g_sInfoBlock and advance the pull pointer (with wrap around)
		//	- if the info block is fetchable, write the new pull position back (this releases the lock) and return true,
		//		otherwise continue with the next entry while still holding the lock
		//on return of true, g_sInfoBlock holds the job and g_sInfoPacketAddr its EA
		__attribute__((always_inline))
		inline const bool GetJobFromJobQueue(const uint32 cSPUID)
		{
			//the push position is located scJobInfoPushPullAddressDiff below the pull position
			const uint32 cPushEA = g_sPullEA - NPPU::scJobInfoPushPullAddressDiff;
			//stack buffer receiving the cache line of the push position
			volatile uint8 pushLSBuffer[128] _ALIGN(128);
			volatile SJobQueuePos *pLS = (volatile SJobQueuePos*)pushLSBuffer;
			//fetch job til we find a fetchable one, if pull pointer hits push pointer, break and idle a bit
			while(1)
			{
				if(spu_extract(g_sLockActive, 0) == 0)
				{
					if(!Lock(cSPUID))
					{
						//switch to slow polling since another SPU has obtained the lock, this way we occupy the bus least
						NSPU::NDriver::g_sLockActive = spu_insert(SLOW_POLLING_IDLE_LOOP_CNT, NSPU::NDriver::g_sLockActive, 1);
						return false;//lock was acquired by another SPU, exit and do some idle loops
					}
					//lock is ours now, use the fast idle count from here on
					NSPU::NDriver::g_sLockActive = spu_insert(FAST_POLLING_IDLE_LOOP_CNT, NSPU::NDriver::g_sLockActive, 1);
				}
#if defined(LOCK_USES_LLAR_EVENT)
				//get a reservation for the push pointer and use the lost reserv.event to quickly react to a write there
				//since only this SPU has acquired the lock, the pull pointer does not need to be checked
				uint32 llEvent;
	#if !defined(FAST_UNSAFE_LL_ENABLE)
				spu_write_event_mask(0);//discard previous (or phantom) events, as needed
				IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
				{
					//read and acknowledge whatever event is still pending
					llEvent = spu_readch(MFC_RD_EVENT_STATUS);
					spu_writech(MFC_WR_EVENT_ACK, llEvent);
				}
	#endif//FAST_UNSAFE_LL_ENABLE
				spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
				mfc_getllar_prep((void*)pushLSBuffer, cPushEA);//transfer push here
				while(1)
				{
					mfc_getllar_again();//transfer push here
					mfc_read_atomic_status();
					//push == pull means that the queue is empty
					IF(pLS->curAddr == g_sCurPullPtr, false)
					{
						//snoop on a write to push
						//reading the event status is what waits here, afterwards the push position is fetched again
						llEvent = spu_readch(MFC_RD_EVENT_STATUS);
						spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
#if defined(DO_SPU_PROFILING)
						spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
#endif
						continue;
					}
					break;
				}
#else
				//get memory here (otherwise we have done it already through Lock)
				//check if there is a new job to be pulled from PPU queue (check push pointer)
				MemcpyLSNoDebug((void*)pushLSBuffer, cPushEA, 128, 0);
				SyncMemory(0);
				//assume false to have it faster if there is a job, otherwise we are happy to occupy the bus less
				IF(pLS->curAddr == g_sCurPullPtr, false)
					return false;
#if defined(DO_SPU_PROFILING)
				spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
#endif
#endif//LOCK_USES_LLAR_EVENT
				//pop the job entry among its parameters
				//transfer runs on tag 1 and is synced below, after the new pull position has been computed
				MemcpyLSNoDebug((TAddrLS)&g_sInfoBlock, g_sCurPullPtr, sizeof(SInfoBlock), 1);
				g_sInfoPacketAddr = g_sCurPullPtr;
				int status;//not used in this function
				//update pull pointer and write back unconditional
				uint32 addr = g_sCurPullPtr + NSPU::NDriver::scSizeOfSJobQueueEntry;
				//wrap around to baseAddr if the incremented address equals topAddr 
				//NOTE(review): assumes CondSelEq(a, b, x, y) yields x if a == b and y otherwise - confirm
				g_sJobQueuePullBack.curAddr		= CondSelEq(addr, pLS->topAddr, pLS->baseAddr, addr);
				g_sJobQueuePullBack.topAddr		= pLS->topAddr;
				g_sJobQueuePullBack.baseAddr	= pLS->baseAddr;
				SyncMemory(1);
				IF(g_sInfoBlock.IsFetchable(), true)//if we have found an invalid entry (which is already in progress from last queue loop), fetch next one
				{
					g_sLockActive = spu_insert((uint32)0, g_sLockActive, 0);//reset locking info
					//do not unlock before we have actually found a valid job
					//16 bytes of g_sJobQueuePullBack are written to the pull position, 
					//	lockObtained is never set in g_sJobQueuePullBack within the visible code
					MemcpyMain(g_sPullEA, (TAddrLS)&g_sJobQueuePullBack, 16, 0);
#ifdef LOCK_USES_LLAR_EVENT
					//disable the lost reservation event again and discard a pending one
					spu_write_event_mask(0);
	#if !defined(FAST_UNSAFE_LL_ENABLE)
					IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
					{
						llEvent = spu_readch(MFC_RD_EVENT_STATUS);
						spu_writech(MFC_WR_EVENT_ACK, llEvent);
					}
	#endif//FAST_UNSAFE_LL_ENABLE
#endif //LOCK_USES_LLAR_EVENT
					break;
				}
				//update pull pointer
				//entry was not fetchable: skip it locally and keep the lock
				g_sCurPullPtr = g_sJobQueuePullBack.curAddr;
			}
			return true;
		}

#if defined(SUPP_SPU_FRAME_STATS)
		//updates the job tick count non atomically
		//pFrameStatBuf: on entry a vector of EAs (a transfer into it on tag g_scDMAPPUMemTag may still be in flight),
		//	the EA selected by g_sInfoBlock.frameProfIndex & 3 is the frame stats entry to update,
		//	the buffer is then reused as the 16 byte transfer buffer for that entry
		//cTicks: decrementer ticks to add (converted to usecs) to word 0 of the entry
		//cAddJobInvoc: value to add to word 1 of the entry
		//the read-modify-write is done with plain DMA transfers, all of them are synced before returning
		__attribute__((always_inline))
		inline void TransferSPUJobFrameStats(volatile vec_uint4* __restrict pFrameStatBuf, const uint32 cTicks, const uint32 cAddJobInvoc)
		{
	#if defined(MEASURE_TIMEOUT)
			SyncMemory(g_scDMAPPUMemTag);//wait for the pending transfer into pFrameStatBuf
	#else
			//faster version of syncing
			spu_writech(MFC_WrTagMask, (1<<g_scDMAPPUMemTag));
			spu_writech(MFC_WrTagUpdate,MFC_TAG_UPDATE_ALL);
			spu_readch(MFC_RdTagStat);
	#endif	//MEASURE_TIMEOUT
			//pick the EA of the frame stats entry belonging to this job
			const uint32 cEA = spu_extract(*pFrameStatBuf, g_sInfoBlock.frameProfIndex & 0x3);
			//copy here current usec count
			MemcpyLS(pFrameStatBuf, cEA, 16, g_scDMAPPUMemTag);
	#if defined(MEASURE_TIMEOUT)
			SyncMemory(g_scDMAPPUMemTag);//wait for the current entry to arrive
	#else
			//faster version of syncing
			spu_writech(MFC_WrTagMask, (1<<g_scDMAPPUMemTag));
			spu_writech(MFC_WrTagUpdate,MFC_TAG_UPDATE_ALL);
			spu_readch(MFC_RdTagStat);
	#endif	//MEASURE_TIMEOUT
			const float cInvTB = 1000.f / 79800.f;//inverse ticks per usec
			//encode the conversion of decrementer ticks to usecs here explicitly
			*pFrameStatBuf  = spu_insert(spu_extract(*pFrameStatBuf, 0) + (uint32)((float)cTicks * cInvTB), *pFrameStatBuf, 0);
			//word 1 is the invocation counter
			*((uint32*)pFrameStatBuf + 1) += cAddJobInvoc;
			//transfer back, sync because of stack usage
			MemcpyMain(cEA, pFrameStatBuf, 16, g_scDMAPPUMemTag);
#if defined(MEASURE_TIMEOUT)
			SyncMemory(g_scDMAPPUMemTag);//wait for the write back, the buffer must not be reused earlier
#else
			//faster version of syncing
			spu_writech(MFC_WrTagMask, (1<<g_scDMAPPUMemTag));
			spu_writech(MFC_WrTagUpdate,MFC_TAG_UPDATE_ALL);
			spu_readch(MFC_RdTagStat);
#endif	//MEASURE_TIMEOUT
		}
#endif

		//signals the end of the current job: either by a PPU callback (interrupt mailbox) or by 
		//	transferring a zero to the external job state
		//use case without branches expects a callback and an independent job
		//if the job uses a packet sync counter (spuPacketSyncIndex is set), it works as follows: 
		//	(lock is located in the first byte of the cache line at g_sSPUPacketSyncEA, 
		//	 the counter in the byte at index spuPacketSyncIndex)
		//	- get sync cache line from PPU mem
		//	- if the counter is already 0, this SPU finishes the job (callback or external job state), 
		//		otherwise loop til the lock has been obtained
		//	- write back the decremented counter together with the released lock atomically, loop til successful
		//return value:
		//	- false if the callback has been written to the interrupt mailbox ("callback does it")
		//	- true in all other cases, the caller then has to disable the job by transferring a 0 to the info block
		//rJobFinished is set to 1 if this SPU is the one finishing the job, it is not written on the other paths:
		//	- neither callback nor external job state are set
		//	- the packet sync counter has not reached 0 yet (other packets are still in progress)
		//if a job has been spawned (and no packet sync is used), neither callback nor external job state are touched
		__attribute__((always_inline))
		inline const bool HandleCallback(uint32& rJobFinished)
		{
			//now toggle callback function if requested, callback is 16 bit index
			//	do it early since it takes long time and the job state has to be tested anyway (which is DMA'd with barrier)
#if defined(_NO_SPU_ASSERT)
			//in the assert case, a wrong branch hint is set
			IF(g_sInfoBlock.callbackIndex == SInfoBlock::scNoIndex && g_sInfoBlock.GetExtJobStateAddress() == 0, false)
#else
			if(g_sInfoBlock.callbackIndex == SInfoBlock::scNoIndex && g_sInfoBlock.GetExtJobStateAddress() == 0)
#endif
				return true;
			IF(g_sInfoBlock.spuPacketSyncIndex != SInfoBlock::scNoIndex, false)
			{
				//decrease counter on PPU atomically if it is not 0, otherwise we have to trigger the callback
				//this way it is ensured that only one SPU toggles it
				uint32 event;//not used within this function
				int status;
				volatile uint8 *pSyncBuffer = (volatile uint8*)g_sLSBuffer;
#if !defined(_NO_SPU_ASSERT)
				//guards both retry loops against spinning forever
				volatile int counter = 0;
#endif
				mfc_getllar_prep(pSyncBuffer, g_sSPUPacketSyncEA);
				//1st loop: finish the job if the counter is 0, otherwise obtain the lock in byte 0
				do
				{
					status = 1;
					mfc_getllar_again();
					mfc_read_atomic_status();
					IF(pSyncBuffer[g_sInfoBlock.spuPacketSyncIndex] == 0, false)
					{
						rJobFinished = 1;
						if(g_sInfoBlock.GetExtJobStateAddress() != 0)
						{
							//external job state takes precedence over the callback
							MemcpyMainFenced(g_sInfoBlock.GetExtJobStateAddress(), (volatile TAddrLS)&g_sZero, 16, g_scDMAOutputTag);
							return true;
						}
						spu_writech(SPU_WrOutIntrMbox, (uint32)g_sInfoBlock.callbackIndex | (EVENT_CALLBACK_PORT << EVENT_PORT_SHIFT));
						return false;//callback does it
					}
					IF(pSyncBuffer[0] == 0, true)//still unlocked
					{
						pSyncBuffer[0] = 1;
						mfc_putllc_again();
						status = mfc_read_atomic_status();
					}
					//if the lock is held by somebody else, status stays 1 and the line is fetched again
#if !defined(_NO_SPU_ASSERT)
					assert(counter++ < 10);
#endif
				}
				WHILE(status != 0, false);

				//decrement and write back atomically
#if !defined(_NO_SPU_ASSERT)
				counter = 0;
#endif
				//2nd loop: the lock is ours, release it and decrement the counter with a single conditional put
				do
				{
					mfc_getllar_again();
					mfc_read_atomic_status();
					pSyncBuffer[0] = 0;//unlock
					--pSyncBuffer[g_sInfoBlock.spuPacketSyncIndex];//decrement
					mfc_putllc_again();
					status = mfc_read_atomic_status();
#if !defined(_NO_SPU_ASSERT)
					assert(counter++ < 10);
#endif
				}
				WHILE(status != 0, false);
				return true;
			}
			//no packet sync: this SPU is the only one processing the job
			rJobFinished = 1;
			IF(!IsJobSpawned(), true)//no multiple packets
			{
				if(g_sInfoBlock.GetExtJobStateAddress() != 0)
				{
					MemcpyMainFenced(g_sInfoBlock.GetExtJobStateAddress(), (volatile TAddrLS)&g_sZero, 16, g_scDMAOutputTag);
					return true;
				}
				spu_writech(SPU_WrOutIntrMbox, (uint32)g_sInfoBlock.callbackIndex | (EVENT_CALLBACK_PORT << EVENT_PORT_SHIFT));
				return false;//callback does it
			}
			return true;
		}

		__attribute__((always_inline))
		inline void SendSignal2(const uint32 cSignal, const uint32 cSPUId)
		{
			//write via mfc to the problem register of the SPU cSPUId
			const uint32 cSNR2Address = 
				NPPU::scPCRawSPUOffset * cSPUId + NPPU::scPCRawSPUBaseAddr + NPPU::scPCRawSPUProbOffset + NPPU::scPCSigNotify2;
			g_SignalBuffer[3] = cSignal;
			spu_mfcdma32(&g_SignalBuffer[3], cSNR2Address, sizeof(uint32), 2, MFC_PUT_CMD);
		}

		//patches the entry branch of a job (and its branch hint) to point to the entry function inside its first bubble
		//cpJob: job binary in LS, its branchOff/bhOff are byte offsets of the instructions to patch
		//cBubMemAddr: LS address of the bubble memory the first bubble resides in
		//the destination is the bubble address + destBubbleOff words, encoded as word address shifted left by 7 
		//	and or'ed into both instructions
		__attribute__((always_inline))
		inline void PatchJobEntry(const NBubBin::SJob* const cpJob, const uint32 cBubMemAddr)
		{
			uint8* const pJobBytes = (uint8*)cpJob;
			const uint32 cEntryLS			= cBubMemAddr + ((uint32)cpJob->destBubbleOff << 2);
			const uint32 cEncodedEntry	= ((cEntryLS >> 2) << 7);
			//branch instruction first
			uint32* const pBranch = (uint32*)&pJobBytes[cpJob->branchOff];
			*pBranch |= cEncodedEntry;
			//branch hint second, if no branch hint exists its offset equals the one of the branch instruction itself 
			//	(the same value gets or'ed again)
			uint32* const pBranchHint = (uint32*)&pJobBytes[cpJob->bhOff];
			*pBranchHint |= cEncodedEntry;
		}

		//prefetch first bubbles and sync the first one itself
		//cpJob: job binary, initialBubbles[1..scMaxSPUBubbleCount-1] hold the bubble indices to stream in (-1 = not set)
		//cDecrVal: current decrementer value, used to estimate when each transfer will have finished
		//each set bubble is transferred into its own slot on its own tag (g_scDMABubbleTag0 + slot) and marked as streaming
		//slot 0 (the first bubble, transfer started by SetupBubbles) is waited for and marked as ready
		//fixes compared to the previous version:
		//	- slot i is now written to g_SPUBubbleMem + i * g_SPUBubbleSize, matching g_BubbleMemLower set up by SetupBubbles
		//		(before, the destination started at g_SPUBubbleMem, so slot 1 overwrote the first bubble in slot 0, 
		//		 and it only advanced for set indices, so the slots behind an unset one were misplaced too)
		//	- slot 0 is marked as ready in _DEBUG builds too (before, only the release path updated the state)
		__attribute__((always_inline))
		inline void PrefetchBubbles(const NBubBin::SJob* const cpJob, const uint32 cDecrVal)
		{
			//now stream in other bubbles
			for(int i=1; i<scMaxSPUBubbleCount; ++i)
			{
				const int16 cBubIndex = cpJob->initialBubbles[i];
				if(cBubIndex != -1)//test if set at all
				{
					//destination is the memory of slot i, independent of which other slots are in use
					const uint32 cSlotLS	= (uint32)g_SPUBubbleMem + (uint32)i * g_SPUBubbleSize;
					const uint32 cBubEA		= g_GlobalSPUBubbleDir[cBubIndex].ea;
					const uint32 cBubSize = g_GlobalSPUBubbleDir[cBubIndex].size;
					MemcpyLargeLS((volatile TAddrLS)cSlotLS, cBubEA, cBubSize, g_scDMABubbleTag0+i);

#if defined(DO_SPU_PROFILING)
					g_PerfStats.bubMemTransferred  += cBubSize;
					++g_PerfStats.bubblesTransferred;
#endif

					g_SPUBubbleStates[i].curIndex = cBubIndex;
					g_SPUBubbleStates[i].curState = BUB_STATE_STREAMING;
					//decrementer counts down: estimated value at which the transfer has finished
					g_SPUBubbleStates[i].transDecrEnd	= cDecrVal - (cBubSize / BYTES_PER_DECR_TICK);
					SetBubbleIndex(cBubIndex, i);
				}
			}

			//sync first bubble
			if(g_SPUBubbleStates[0].curState != BUB_STATE_READY)
			{
				//the transfer is complete once the sync below returns, valid for both build flavours
				g_SPUBubbleStates[0].curState = BUB_STATE_READY;
		#if defined(_DEBUG)
				SyncMemory(g_scDMABubbleTag0);
		#else
				const uint32 cTagMask = (1<<g_scDMABubbleTag0);
				spu_writech(MFC_WrTagMask, cTagMask);
				spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
				spu_readch(MFC_RdTagStat);
		#endif //_DEBUG
			}			
		}

		//sets up the bubble (code page) memory for the job to be started
		//rpCurAddr: in/out, current top of the used LS memory, gets incremented by the size of the bubble memory
		//cBubbleMode: bubble layout of the job (single bubble or 4 slots of 8, 16 or 32 KB)
		//cFirstBubbleIndex: index into g_GlobalSPUBubbleDir of the bubble the job starts in
		//does the following:
		//	- resets the bubble directory if the mode differs from the one of the last job
		//	- starts the transfer of the first bubble into slot 0 if it is not already there (not synced here)
		//	- resets the states of slot 1..3, sets bubble size, slot address ranges and LRU 
		//	- resets the return stack to a single entry flagged as job bubble
		__attribute__((always_inline))
		inline void SetupBubbles(TAddrLS& rpCurAddr, const NPPU::EBubbleMode cBubbleMode, const uint16 cFirstBubbleIndex)
		{
			const NPPU::EBubbleMode cLastBubbleMode = GetLastBubbleMode();
			//if last mode is different to current one, reset LRU and dir
			const vec_int4 cResetBubDir = spu_splats(-1);
			g_SPUBubbleDir = ((int)cLastBubbleMode != (int)cBubbleMode)?cResetBubDir : g_SPUBubbleDir;
			//get first bubble, transfer takes the longest
			//skipped if slot 0 still holds that bubble from the previous job
			if(g_SPUBubbleStates[0].curIndex != cFirstBubbleIndex)
			{
				const uint32 cBubEA		= g_GlobalSPUBubbleDir[cFirstBubbleIndex].ea;
				const uint32 cBubSize = g_GlobalSPUBubbleDir[cFirstBubbleIndex].size;
				MemcpyLargeLS(g_SPUBubbleMem, cBubEA, cBubSize, g_scDMABubbleTag0);
				g_SPUBubbleStates[0].curIndex			= cFirstBubbleIndex;
				g_SPUBubbleStates[0].curState			= BUB_STATE_STREAMING;//will be ready before job starts (forced)
				SetBubbleIndex(cFirstBubbleIndex, 0);
#if defined(DO_SPU_PROFILING)
				g_PerfStats.bubMemTransferred  += cBubSize;
				++g_PerfStats.bubblesTransferred;
#endif
			}			
#if defined(DO_SPU_PROFILING)
			//stored in units of 16 bytes
			g_PerfStats.firstBubbleSize  = g_GlobalSPUBubbleDir[cFirstBubbleIndex].size >> 4;
#endif
			NSPU::g_SPUBubbleStates[1].Reset();
			NSPU::g_SPUBubbleStates[2].Reset();
			NSPU::g_SPUBubbleStates[3].Reset();

			//set up bubbles
			//NOTE(review): the switch has no default, bubbleMemSize stays uninitialized and g_SPUBubbleSize stale 
			//	if cBubbleMode is none of the four modes handled here - confirm that EBubbleMode has no other values
			uint32 bubbleMemSize;
			switch(cBubbleMode)
			{
			case NPPU::eBM_Single : g_SPUBubbleSize = g_GlobalSPUBubbleDir[cFirstBubbleIndex].size;  
				bubbleMemSize = g_SPUBubbleSize;	break;
			case NPPU::eBM_4x8		: bubbleMemSize = 4 *  8 * 1024; g_SPUBubbleSize = 8 * 1024;  break;
			case NPPU::eBM_4x16		: bubbleMemSize = 4 * 16 * 1024; g_SPUBubbleSize = 16 * 1024; break;
			case NPPU::eBM_4x32		:	bubbleMemSize = 4 * 32 * 1024; g_SPUBubbleSize = 32 * 1024; break;
			}
			SetLastBubbleMode(cBubbleMode);

			//set the lower and upper address of the bubble memory, used to compare against it with spu_cgt in CodePagingCallMissHandler(asm)
			//word i of g_BubbleMemLower = g_SPUBubbleMem + i * g_SPUBubbleSize
			g_BubbleMemLower				= spu_promote((uint32)g_SPUBubbleMem, 0);
			uint32 incrBubBaseAddr	= g_SPUBubbleSize + (uint32)g_SPUBubbleMem;
			g_BubbleMemLower				= spu_insert(incrBubBaseAddr, g_BubbleMemLower, 1);
			incrBubBaseAddr					+= g_SPUBubbleSize;
			g_BubbleMemLower				= spu_insert(incrBubBaseAddr, g_BubbleMemLower, 2);
			incrBubBaseAddr					+= g_SPUBubbleSize;
			g_BubbleMemLower				= spu_insert(incrBubBaseAddr, g_BubbleMemLower, 3);
			g_BubbleMemUpper				= spu_add(g_BubbleMemLower, g_SPUBubbleSize);
			ResetBubbleLRUCounter();
			ResetBubbleLRU();
			UpdateBubbleLRUByIndex(0, 1);//make sure first bubble does not get replaced
			//everything behind the bubble memory is available for the job
			IncrementPointer(rpCurAddr, bubbleMemSize);

			//reset return stack
			const SReturnStackEntry cTopEntry = {0, SReturnStackEntry::cIsJobBubble, 0, 0, 0, 0, 0, 0};
			//push a first index (which is the job call to the first bubble)
			//top index is the offset to the current top index (0, 16, 32..) (16 = sizeof(SReturnStackEntry))
			g_pReturnStackTop			= &g_ReturnStack[0];
			g_ReturnStack[0]			= cTopEntry;
			g_CurBubIndex					= (vec_uint4)0;
		}

		__attribute__((always_inline))
		inline void ResetCacheControl()
		{
			//reset cache
			const vec_uint4 cZero = spu_splats((uint32)0);
			const int cNumSets = g_SPUNumSets;
			//reset cache dir entries, 4 at once to give the branch hint a chance to be set
			for(int set=0; set<cNumSets; set += 4)
			{
				g_pSPUCacheLRUCtrl[set]		= cZero;
				g_pSPUCacheDir[set]				= cZero;
				g_pSPUCacheLRUCtrl[set+1] = cZero;
				g_pSPUCacheDir[set+1]			= cZero;
				g_pSPUCacheLRUCtrl[set+2] = cZero;
				g_pSPUCacheDir[set+2]			= cZero;
				g_pSPUCacheLRUCtrl[set+3] = cZero;
				g_pSPUCacheDir[set+3]			= cZero;
			}
			g_LRUCounterIncr = g_LRUCounter = cZero;//reset LRU counters
			NCache::g_AsyncRangesDirFrom		= spu_splats((uint32)0xFFFFFFFF);
			NCache::g_AsyncRangesDirTo			= cZero;
#if defined(SUPP_ASYNC)
			NCache::g_CurSPUAsyncTag				= 0;
			NCache::g_SPUAsyncDir						= cZero;
#endif
#if defined(PREFETCH)
			NCache::g_PrefetchLRUDir	= NCache::g_PrefetchDir = cZero;
#endif
		}
		
		__attribute__((always_inline))
		inline void InitCache()
		{
			const vec_uint4 cZero = spu_splats((uint32)0);
#if defined(PREFETCH) && !defined(SPU_CACHE_MISS_USE_ASM)
			register vec_uint4 g_CurWrittenEA __asm__ ("$79"); 
			g_CurWrittenEA	  = (vec_uint4)0; 
#endif

#if defined(DO_SPU_PROFILING)
			NCache::g_SPUCacheHitIncr = cZero;
			NCache::g_SPUCacheHitIncr = spu_insert((uint32)1, NCache::g_SPUCacheHitIncr, 0);
			//init profiling data
			for(int i=0; i<MAX_PROF_ID * (4*3) / sizeof(vec_uint4); ++i)
				NCache::g_SPUCacheProfIDCounter[i] = (vec_uint4)0;
			NCache::g_SPUCacheCurProfID = (vec_uint4)0;
#endif

			NCache::g_CurAtomicEA				= 0;
#if defined(ENABLE_HAZARD_MODE)
			NCache::g_AtomicEAToStart		= 0; 
#endif

			g_SPUBubbleIndexMask0 = (vec_uint4)0;
			g_SPUBubbleIndexMask0 = spu_insert(1, g_SPUBubbleIndexMask0, 1); 
			g_SPUBubbleIndexMask0 = spu_insert(2, g_SPUBubbleIndexMask0, 2);
			g_SPUBubbleIndexMask0 = spu_insert(3, g_SPUBubbleIndexMask0, 3);

			g_SPUBubbleIndexMaskShl4 = (vec_uint4)0;
			g_SPUBubbleIndexMaskShl4 = spu_insert(16, g_SPUBubbleIndexMaskShl4, 1); 
			g_SPUBubbleIndexMaskShl4 = spu_insert(32, g_SPUBubbleIndexMaskShl4, 2);
			g_SPUBubbleIndexMaskShl4 = spu_insert(48, g_SPUBubbleIndexMaskShl4, 3);
			g_sZero = cZero;
		}

		//sets up the cache for the current job right behind the memory used so far
		//cCurTopLS: current top of the used LS memory, the cache starts at the next 128 byte boundary
		//the cache size is the largest of 64, 32, 16, 8 or 4 KB whose total footprint fits into the remaining memory, 
		//	limited by the maximum cache size requested by the job
		//resulting layout: cache | shadow cache | directory | LRU control
		//the cache control data are not cleared here (see ResetCacheControl)
		__attribute__((always_inline))
		inline void SetupCache(const uint32 cCurTopLS)
		{
			const uint32 cTopLSALigned = AlignSize128(cCurTopLS);
			//set up cache
			//determine cache size: 4, 8, 16, 32 or 64 KB
			//each cache requires: size * 2(cache + shadow cache) + size / 8 (LRU and DIR) bytes
			//remaining memory = 255 KB - minimum stack size of the job - memory used so far
			const uint32 cRemSize = 255 * 1024 - g_sInfoBlock.GetMinStackSize() - (uint32)cTopLSALigned;
			const uint32 cMaxRequCacheSize = g_sInfoBlock.GetMaxCacheSize();
			//even the smallest cache must fit
			spu_CheckCacheHazard(cRemSize >= TOTAL_CACHE_SIZE(4*1024));
			uint32 cacheSize = 
				cRemSize >= TOTAL_CACHE_SIZE(64*1024)?64*1024 :
				cRemSize >= TOTAL_CACHE_SIZE(32*1024)?32*1024 :
				cRemSize >= TOTAL_CACHE_SIZE(16*1024)?16*1024 : 
				cRemSize >= TOTAL_CACHE_SIZE(8*1024)?8*1024 : 4*1024;
			//do not use more than the job has requested
			cacheSize = (cacheSize>cMaxRequCacheSize)?cMaxRequCacheSize : cacheSize;
#if defined(DO_SPU_PROFILING)
			g_PerfStats.cacheSize = (cacheSize >> 10);//in KB
#endif

			g_pSPUCache = (vec_uint4*)(void*)cTopLSALigned;//entire cache memory
			//cache copy for each line, contains original state for each bit in a cacheline to only write back 
			//	the bits which have changed (if any)
			//	costs considerable amount of memory (same as cache itself), but only way to ensure to only write back
			//		the changed bits and enables us to not mark anything as dirty
			NSPU::NCache::g_pSPUShadowCache	= (vec_uint4*)((uint8*)g_pSPUCache + cacheSize);
			//cache directory containing line addresses (1x uint32per cache line), 0 indicates no mapping
			g_pSPUCacheDir									= (vec_uint4*)((uint8*)NSPU::NCache::g_pSPUShadowCache + cacheSize);
			//cache LRU Control, 1x uint32 counter per cache line in each set
			//the directory in front of it occupies cacheSize / 32 bytes
			g_pSPUCacheLRUCtrl							= (vec_uint4*)((uint8*)g_pSPUCacheDir + (cacheSize >> 5));
			//number of sets
			g_SPUNumSets										= (((cacheSize >> scSPUCacheLineSizeShift)) >> scSPUCacheSetNumWaysShift);
			
#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
			//the cache now belongs to the memory occupied by the program
			g_sProgramTopLS += TOTAL_CACHE_SIZE(cacheSize);
#endif 

			//initialize global cache constants
			//word i = g_pSPUCache + i * 128
			g_SPUCacheLineOffValues = spu_splats((uint32)0);//create 384, 256, 128, 0
			g_SPUCacheLineOffValues = spu_insert(128, g_SPUCacheLineOffValues, 1); 
			g_SPUCacheLineOffValues = spu_insert(256, g_SPUCacheLineOffValues, 2);
			g_SPUCacheLineOffValues = spu_insert(384, g_SPUCacheLineOffValues, 3);
			g_SPUCacheLineOffValues = spu_add(g_SPUCacheLineOffValues, spu_splats((uint32)g_pSPUCache));

			//set index mask, shifted left by 4
			g_SetMaskSL4	= (vec_uint4)((g_SPUNumSets-1) << 4);
		}

		//advances the pull address of a queue by cIncr bytes
		//rCurPullAddr: in/out, current pull address
		//cQueueStart/cQueueEnd: range of the queue, the address wraps around to cQueueStart 
		//	as soon as it reaches or exceeds cQueueEnd
		__attribute__((always_inline))
		inline void IncrQueuePullPointer(uint32& rCurPullAddr, const uint32 cIncr, const uint32 cQueueStart, const uint32 cQueueEnd)
		{
			uint32 advancedPull = rCurPullAddr + cIncr;
			if(advancedPull >= cQueueEnd)
				advancedPull = cQueueStart;
			rCurPullAddr = advancedPull;
		}

		//processes a fetched job until no further packets are found
		//packets have all the same size in terms of parameter data and list size
		//expects g_sInfoBlock to contain the info block of the job to process
		//	cAvailMemStart: LS address the job memory starts at (bubbles, job program, queue buffer, parameters, cache)
		//	cSPUID:					id of this SPU, only used for the frame stats (SUPP_SPU_FRAME_STATS)
		//	cMaxSPUs:				SPU count, only used for the frame stats (SUPP_SPU_FRAME_STATS)
		//control flow:
		//	- issues the DMA transfers for the job program (skipped if it is still present from the last job), 
		//		the bucket headers and (producer/consumer mode) the queue control line + first parameter packet
		//	- without a queue: executes the job once per packet embedded in the info block, flushes the cache once, 
		//		then handles the callback
		//	- with a queue: executes the job once per queue packet and flushes the cache each time, finally 
		//		writes pull pointer and finished state back atomically or restarts if the push pointer has moved
		//	- transfers the bucket headers back and (if requested) clears the job state of the info packet
		__attribute__((always_inline))
		inline void ProcessJob(const uint32 cAvailMemStart, const uint32 cSPUID, const uint32 cMaxSPUs)
		{
#if defined(SUPP_SN)
			if(g_sInfoBlock.IsDriverDebugEnabled())
				__asm volatile ("stop 255");
			g_sDebugState = (uint32)g_sInfoBlock.IsDebugEnabled();
#endif
			//get the info packet, cDriverSize is a multiple of 16 (ensured by PPU)
			TAddrLS pCurAddr = (TAddrLS)cAvailMemStart;//128 byte aligned

#if defined(DO_SPU_PROFILING) || defined(SUPP_SPU_FRAME_STATS)
			//decrementer is reset right before calling ProcessJob
	#if defined(DO_SPU_PROFILING) 
			g_PerfStats.Reset();
			const uint32 cJobStartTime = spu_readch(SPU_RdDec);
			g_PerfStats.driverSize = cAvailMemStart >> 4;
	#endif
#endif
			//setup of the cache and bubble memory, must come first since g_SPUBubbleMem remains constant
			const NPPU::EBubbleMode cBubbleMode = g_sInfoBlock.GetBubbleMode();
			const uint16 cFirstBubbleIndex = g_sInfoBlock.GetFirstBubbleIndex();
			SetupBubbles(pCurAddr, cBubbleMode, cFirstBubbleIndex);
			TAddrLS pJobEntry = pCurAddr;
			const uint32 cJobSize = g_sInfoBlock.jobSize << 2;
			const uint32 cJobProgramEA = g_sInfoBlock.eaDMAJobAddress;
			bool syncOnJob = false;
			//copy job only if not still present, useful since it is by far the most time consuming setup job
			if(cJobProgramEA != GetLastProgramEA())
			{
				MemcpyLS(pJobEntry, cJobProgramEA, cJobSize, g_scDMAJobTag);//cannot be large
				SetLastProgramEA(cJobProgramEA);
				syncOnJob = true;//transfer is synced lazily right before the first execution
			}
			IncrementPointer(pCurAddr, cJobSize);
			
			//transfer bucket headers here, for __SPU__ we have a padding of 16 bytes
			MemcpyLS
			(
				(void*)NSPU::NDriver::g_sMemMan.GetBucketSPUAddr(),
				NSPU::NDriver::g_DestMemAreaEA,
				128, 
				g_scDMAPPUMemTag
			);
			ResetJobSpawnReg();
			uint32 queueAddress = 0;
			const bool cHasQueue = g_sInfoBlock.HasQueue();
			volatile uint8* pQueueBuffer = NULL;
			uint32 nextPacketEA = 0;
			IF(cHasQueue, false)
			{
				queueAddress	= g_sInfoBlock.GetQueue();
				assert((queueAddress & 127) == 0);//must be cache line aligned
				pCurAddr			= (TAddrLS)AlignSize128((uint32)pCurAddr);
				//transfer push/pull pointer here
				pQueueBuffer	= (volatile uint8*)pCurAddr;
				//transfer pull/push ptr of queue here
				MemcpyLS((volatile TAddrLS)pQueueBuffer, queueAddress, 128, g_scDMAListTag);
				IncrementPointer(pCurAddr, 128);
				//queue mode has no inner packet loop, make sure it terminates after a single iteration
				nextPacketEA = SInfoBlock::scNoPacketVal;
			}
#if !defined(DO_SPU_PROFILING) && !defined(SUPP_SPU_FRAME_STATS)
			//do it now since this way we do not lose any performance
			spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
#endif

#if defined(DO_SPU_PROFILING)
			g_PerfStats.jobSize = cJobSize >> 4;
			g_PerfStats.spuFetchTime = g_JobFetchTime;
#endif
			//now inform next SPU to change state from wait to polling if active(if not ID 4)
			const bool cStateNextSPUWaiting = ((spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 1) == STATE_NEXT_SPU_DEACTIVATED));
			IF(cStateNextSPUWaiting, true)
			{
				//send STATE_EVENT_POLLING signal to next SPU since we need 1 responsive SPU
				SendSignal2(STATE_EVENT_POLLING, spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 3));
				NSPU::NDriver::g_sEventStatNextSPU = spu_insert(STATE_NEXT_SPU_ACTIVATED, NSPU::NDriver::g_sEventStatNextSPU, 1);
			}

			uint8* __restrict pParamArea;
			int syncOnFlushCache;
			//the queue variables below are only initialized and used in producer/consumer queue mode
			volatile uint32* pQueuePull;
			volatile uint32* pQueuePush;
			volatile uint32* pQueueJobState;
			uint32 curPullAddr, curPushAddr;
			uint32 queueIncr, queueStart, queueEnd;
			uint32 curTopLS = (uint32)pCurAddr;
			const uint32 cParamSize = (g_sInfoBlock.paramSize << 4);
			IF(!cHasQueue, true)
			{
				pParamArea				= g_sInfoBlock.GetParamAddress();//embedded
				//only sync the cache flush if no external job state address is set
				syncOnFlushCache	= (g_sInfoBlock.GetExtJobStateAddress() == 0)?1:0;
			}
			else
			{
				//sync transfer of queue here
				syncOnFlushCache = 1;//flush cache must be syncd
#if defined(MEASURE_TIMEOUT)
				SyncMemory(g_scDMAListTag);			//sync transfer of next parameter packet
#else
				spu_writech(MFC_WrTagMask, (1<<g_scDMAListTag));
				spu_writech(MFC_WrTagUpdate,MFC_TAG_UPDATE_ALL);
				spu_readch(MFC_RdTagStat);
#endif	//MEASURE_TIMEOUT
				//set push, pull and job state pointer
				pQueuePull			= (uint32*)&pQueueBuffer[NPPU::scProdConsPullOff];
				pQueuePush			= (uint32*)&pQueueBuffer[NPPU::scProdConsPushOff];
				pQueueJobState	= (uint32*)&pQueueBuffer[NPPU::scProdConsDMAOff];
				//get constant queue increment, start and end address
				queueIncr				= *(volatile uint32*)&pQueueBuffer[NPPU::scProdConsPullIncr];
				queueStart			= *(volatile uint32*)&pQueueBuffer[NPPU::scProdConsPullStart];
				queueEnd				= *(volatile uint32*)&pQueueBuffer[NPPU::scProdConsPullEnd];
				curPullAddr			= *pQueuePull;
				curPushAddr			= *pQueuePush;
				assert(curPullAddr != curPushAddr);
				//transfer first parameter packet here
				MemcpyLS(pCurAddr, curPullAddr, cParamSize, g_scDMAPPUMemTag);
				IncrQueuePullPointer(curPullAddr, queueIncr, queueStart, queueEnd);
				pParamArea = (uint8*)pCurAddr;
				curTopLS += cParamSize;
			}
			//definition of job function (takes pointer to transferred parameters as input param)
			//entry point is located right behind the job header (and the module header for SUPP_SN)
		#if defined(SUPP_SN)
			volatile void (*pFnctExecute)(void*) = (volatile void (*)(void*))((uint32)pJobEntry + sizeof(NBubBin::SJob) + sizeof(spu_mod_hdr));
		#else
			volatile void (*pFnctExecute)(void*) = (volatile void (*)(void*))((uint32)pJobEntry + sizeof(NBubBin::SJob));
		#endif

#if defined(DO_SPU_PROFILING)
			uint32 jobTicks = 0;
#endif

#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
			g_sProgramTopLS = curTopLS;
			assert(NSPU::g_sProgramTopLS <= (256-16) * 1024);
#endif

			NSPU::NDriver::g_sMemMan.Reset();//reset memory management

			bool disableJobAtEnd = true;
			bool firstParamLoop = true;
			bool firstLoopIt = true;
			bool paramLoop = true;
			uint32 addJobInvoc = 0;//count additional job invocations through prod./cons.queue

			//re-entered via goto if the PPU has pushed new packets while the queue was about to be finished
StartParameterLoop:
			SetupCache(curTopLS);	//allow diff.cache settings for each job
//------------------------------------------prod/cons queue packet loop begin------------------------------------------------
			do 
			{
				ResetCacheControl();
//------------------------------------------inner packet loop begin------------------------------------------------
				do 
				{
					//in queue mode each iteration has a parameter packet transfer in flight on this tag
					const bool cSync = (firstLoopIt || cHasQueue);
					IF(cSync, true)
					{
	#if defined(MEASURE_TIMEOUT)
						SyncMemory(g_scDMAPPUMemTag);//transfer of bucket headers, only required once
	#else
						//faster version of syncing
						spu_writech(MFC_WrTagMask, (1<<g_scDMAPPUMemTag));
						spu_writech(MFC_WrTagUpdate,MFC_TAG_UPDATE_ALL);
						spu_readch(MFC_RdTagStat);
	#endif	//MEASURE_TIMEOUT
					}
					IF(!cHasQueue, true)
					{
						const uint8* cNextParamArea			= pParamArea + cParamSize;
						//end of the info block as byte address; the addition must be performed on the integer address, 
						//	adding to the SInfoBlock pointer would scale the offset by sizeof(SInfoBlock)
						const uint32 cEndParamInputArea	= (uint32)&g_sInfoBlock + sizeof(SInfoBlock);
						nextPacketEA = *(uint32*)cNextParamArea;//address of next packet
						//a packet starting at or behind the end of the info block cannot exist -> terminate loop
						nextPacketEA = ((uint32)cNextParamArea < cEndParamInputArea)?nextPacketEA : SInfoBlock::scNoPacketVal;
						paramLoop = false;
					}

					//only sync code if it was not present before
					if(syncOnJob)
					{
						syncOnJob = false;
	#if defined(_DEBUG)
						SyncMemory(g_scDMAJobTag);
	#else
						const uint32 cTagMask = (1<<g_scDMAJobTag);
						spu_writech(MFC_WrTagMask, cTagMask);
						//use polling in case of profiling since the large job seemed to hang up the read to the decrementer
		#if !defined(DO_SPU_PROFILING)
						spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
						spu_readch(MFC_RdTagStat);
		#else			
						do {} WHILE(spu_mfcstat(MFC_TAG_UPDATE_IMMEDIATE) != cTagMask, false);
		#endif
	#endif //_DEBUG
						PatchJobEntry((NBubBin::SJob*)pJobEntry, (uint32)g_SPUBubbleMem); 
					}

					IF(!firstLoopIt, false)
					{
						//leave bubbles as they are but make sure the first one is present, patch again accordingly
						const bool cTransferring	= NCodePaging::SetActiveBubbles((int)cFirstBubbleIndex, -1, -1, -1);
						const vec_int4 cSplatID		= spu_splats((int)cFirstBubbleIndex);
						const int cFirstBubSlot		= GetBubbleSlot(cSplatID);
						PatchJobEntry((NBubBin::SJob*)pJobEntry, (uint32)(g_SPUBubbleMem + cFirstBubSlot * g_SPUBubbleSize));
						//sync first bubble if it was transferring
						IF(cTransferring, false)
						{
							g_SPUBubbleStates[cFirstBubSlot].curState = BUB_STATE_READY;
							SyncMemory(g_scDMABubbleTag0+cFirstBubSlot);
						}
					}
					else
						PrefetchBubbles((NBubBin::SJob*)pJobEntry, 0xFFFFFFFD);

	#if defined(DO_SPU_PROFILING)
					const uint32 cJobExecStartTime = spu_readch(SPU_RdDec);
	#endif

	#if defined(DO_SPU_PROFILING)
					g_PerfStats.stackSize = (((NSPU::GetStackAddress() - g_sProgramTopLS)) >> 10);//in KB
	#endif

	#if !defined(_NO_SPU_ASSERT)
					const unsigned int cStackBefore = NSPU::GetStackAddress();
	#endif

					pFnctExecute((void*)pParamArea);//execute job 

	#if !defined(_NO_SPU_ASSERT)
					//the job must leave the stack pointer as it has found it
					const unsigned int cStackAfter = NSPU::GetStackAddress();
					assert(cStackAfter == cStackBefore);
	#endif

	#if defined(DO_SPU_PROFILING)
					//decrementer counts down, so start - end yields the elapsed ticks
					const uint32 cJobExecEndTime = spu_readch(SPU_RdDec);
					jobTicks += cJobExecStartTime - cJobExecEndTime;
	#endif
					//first bubble must still be present at its position
					//queue packets are always transferred to the same location, embedded packets are consecutive
					pParamArea += cHasQueue?0 : cParamSize;
					firstLoopIt = false;
					assert(IsBubblePresent(spu_splats((int)cFirstBubbleIndex)));
				}WHILE(nextPacketEA != SInfoBlock::scNoPacketVal, false);
//------------------------------------------inner packet loop end------------------------------------------------

				IF(cHasQueue, false)
				{
					//increment pull pointer and get next packet if pull != push
					paramLoop = curPullAddr != curPushAddr;
					if(paramLoop)
					{
						//transfer next parameter packet here
						MemcpyLS(pParamArea, curPullAddr, cParamSize, g_scDMAPPUMemTag);
						IncrQueuePullPointer(curPullAddr, queueIncr, queueStart, queueEnd);
					}
				}

				//flush cache	so that callbacks see all updated data reside on main memory as well
				//this is time critical, flushing is shortly faster than the callback wakeup on PPU
				//currently it is syncd to avoid any time critical issues
#if defined(SPU_CACHE_MISS_USE_ASM)
				FlushCacheComplete(syncOnFlushCache);
#else
				NSPU::CSPUMemMan::FlushCacheComplete(syncOnFlushCache, firstParamLoop);
#endif//SPU_CACHE_MISS_USE_ASM
				HandleJobSpawn();
	#if defined(DO_SPU_PROFILING)
				//do only for first parameter iteration, otherwise it depends on the num of packets processed
				if(firstParamLoop && g_sInfoBlock.eaJobPerfAddress != 0)
				{
					const uint32 cJobEndTime		= spu_readch(SPU_RdDec);
					//decrementer reading takes at least 20 cycles, 40 cycles per tick currently
					g_PerfStats.spuJobTime			= jobTicks - 1;
					g_PerfStats.spuSetupTime		= (uint16)((cJobStartTime - cJobEndTime - 1) - jobTicks);
					//cache stats written by SPUMemManager
					//transfer back
					MemcpyMain(g_sInfoBlock.eaJobPerfAddress, (TAddrLS)&g_PerfStats, sizeof(SJobPerfStats), g_scDMAOutputTag);
				}
	#endif
				firstParamLoop = false;
			}WHILE(paramLoop, false);
//------------------------------------------prod/cons queue packet loop end------------------------------------------------

			uint32 jobFinished = 0;
			IF(!cHasQueue, true)
				disableJobAtEnd = HandleCallback(jobFinished);
			else
			{
				//tricky part of queue synchronization
				//update loop, loop until we either successfully wrote the updated pointers/state back 
				//	or the push pointer has changed on PPU side
				int status;
				const uint32 cOldPush = curPushAddr;
#if !defined(_NO_SPU_ASSERT)
				volatile int counter = 0;
#endif
				mfc_getllar_prep(pQueueBuffer, queueAddress);
				do 
				{
					status = 1;
#if !defined(_NO_SPU_ASSERT)
					assert(counter++ < 10);
#endif
					//get lock for queue
					mfc_getllar_again();
					mfc_read_atomic_status();
					//check if push has been changed in the meantime
					curPushAddr = *pQueuePush;
					if(cOldPush != curPushAddr)//push pointer has been changed, init all for next parameter packet iteration
					{
						//pull pointer is only changed on SPU, so no need to fetch it again
						//transfer first parameter packet here
						MemcpyLS(pParamArea, curPullAddr, cParamSize, g_scDMAPPUMemTag);
						IncrQueuePullPointer(curPullAddr, queueIncr, queueStart, queueEnd);
						status = 0;
						++addJobInvoc;
						goto StartParameterLoop;//restart loop
					}
					//now we have to atomically update the pull pointer, and set the finished state
					*pQueuePull			= curPullAddr;
					*pQueueJobState = NPPU::scJobFinished;
					mfc_putllc_again();
					status = mfc_read_atomic_status();//non zero: conditional store has failed, retry
				}while(status != 0);
			}

			//transfer bucket headers back, transfer any garbage in front of it (16 bytes) to copy 128 bytes and get peak performance
			NSPU::MemcpyMain
			(
				NSPU::NDriver::g_DestMemAreaEA,
				(void*)NSPU::NDriver::g_sMemMan.GetBucketSPUAddr(),
				128,
				g_scDMAOutputTag
			);

			//write via DMA a 0 to the job state address (the one tracking job slots, not the one informing waiters)
			if(disableJobAtEnd)
				MemcpyMain(g_sInfoPacketAddr, (volatile TAddrLS)&g_sZero, 16, g_scDMAOutputTag);//located on the beginning of the info packet
#if defined(SUPP_SPU_FRAME_STATS)
			uint32 fullTickCount;
			int status;
	#if !defined(_NO_SPU_ASSERT)
			volatile int counter = 0;
	#endif
			//atomically update, get address of frame profile data for this particular job
			//copy 16 bytes to get better DMA throughput
			volatile vec_uint4 frameProfBuf;
			IF(jobFinished == 1, true)
				MemcpyLS(&frameProfBuf, (g_FrameProfileDataBase + (g_sInfoBlock.frameProfIndex << 2)) & ~15, 16, g_scDMAPPUMemTag);
			NPPU::SSingleSPUStat curStats _ALIGN(128);
			mfc_getllar_prep((void*)&curStats, g_DestStatsAreaEA);
			do 
			{
				status = 1;
#if !defined(_NO_SPU_ASSERT)
				assert(counter++ < 10);
#endif
				//get lock for the stats
				mfc_getllar_again();
				mfc_read_atomic_status();
				const uint32 cCurDecrCnt		= spu_readch(SPU_RdDec);
				//slot of this SPU relative to the current pivot SPU (wraps around at cMaxSPUs)
				const uint32 cCurStatIndex	= (cSPUID >= curStats.curSPUPivot)?
					(cSPUID - curStats.curSPUPivot) : (cMaxSPUs - curStats.curSPUPivot + cSPUID);
				//decrementer has been reset to 0xFFFFFFFF and counts down
				fullTickCount = (0xFFFFFFFFU - cCurDecrCnt);
				curStats.count[cCurStatIndex] += fullTickCount;
				mfc_putllc_again();
				status = mfc_read_atomic_status();
			}while(status != 0);
#endif
			//NOTE(review): frameProfBuf and fullTickCount are only declared if SUPP_SPU_FRAME_STATS is defined, 
			//	presumably TransferSPUJobFrameStats discards its arguments otherwise - confirm
			IF(jobFinished == 1, true)
				TransferSPUJobFrameStats(&frameProfBuf, fullTickCount, addJobInvoc);
			ReleaseDebugging();
		}
	}//NDriver
}//NSPU

//dont rename, main is specified entry point by makefile
//main is relocated and cPacketInfo is set up by initial spu loader, up to 4 params are possible
//cPacketInfo[0] pull address for jobs (128 byte aligned, the lower 7 bits carry cMaxSPUs)
//cPacketInfo[1] destination area of memory with PPU communication
//cPacketInfo[2] SBubbleDirInfo address
//cPacketInfo[3] driver size (new memory is transferred straight behind it) (lower 24 bit, upper 8 bit SPU id)
//does never return: after the one time setup it loops forever and either polls the job queue and processes 
//	the fetched job including its dependent jobs, or it blocks on signal notification channel 2 
//	until the received value puts it into polling state
int main(const NSPU::NDriver::SSpuParam cPacketInfo)
{
	NSPU::NDriver::InitCache();
	uint32 availMemStart = cPacketInfo.data[3] & (255 | (255 << 8) | (255 << 16))/*driver size*/;
	//cPacketInfo tells where PPU pushes to and we have to pull from the individual job queue
	NSPU::NDriver::g_sPullEA	= (uint32)cPacketInfo.data[0] & ~127;//pull is 128 byte aligned
	const uint32 cMaxSPUs			= (uint32)cPacketInfo.data[0] & 127;//packed into the lower 7 bits
	//packet sync is right before pull
	NSPU::NDriver::g_sSPUPacketSyncEA = NSPU::NDriver::g_sPullEA + NPPU::scJobInfoBlocksSPUPacketSyncAddressDiff; 
	NSPU::NDriver::g_sJobInfoBlockEA	= NSPU::NDriver::g_sPullEA + NPPU::scJobInfoBlocksPullAddressDiff;
#if defined(SUPP_SPU_FRAME_STATS)
	NSPU::NDriver::g_FrameProfileDataBase = (uint32)(NSPU::NDriver::g_sPullEA + NPPU::scJobInfoBlocksFrameProfDataSyncAddressDiff);
#endif
	NSPU::NDriver::ResetLastProcessData();

	//g_sLockActive: element 0 is checked as lock flag in the polling loop, element 1 holds the idle loop count
	NSPU::NDriver::g_sLockActive = spu_splats((uint32)0);

	const uint32 cSPUID = (cPacketInfo.data[3] & (255 << 24)) >> 24;

	//first SPU performs fast polling, others are activated on demand and poll with a lower rate
	const uint32 cIdleLoops = (cSPUID == 0)?FAST_POLLING_IDLE_LOOP_CNT : SLOW_POLLING_IDLE_LOOP_CNT;
	//init idle loops
	NSPU::NDriver::g_sLockActive = spu_insert(cIdleLoops, NSPU::NDriver::g_sLockActive, 1);

	#if !defined(_NO_SPU_ASSERT) || defined(DO_SPU_PROFILING)
		NSPU::g_sProgramTopLS = 0;
	#endif

	//g_sEventStatNextSPU: element 0 = own event state, element 1 = state of next SPU, element 3 = id of next SPU
	//only first SPU is allowed to poll initially
	NSPU::NDriver::g_sEventStatNextSPU = spu_insert
	(
		(cSPUID != 0)? STATE_EVENT_WAIT : STATE_EVENT_POLLING,
		NSPU::NDriver::g_sEventStatNextSPU,
		0
	);
	//initialize the signal status 
	uint32 nextSPUId = cSPUID + 1;		nextSPUId = (nextSPUId >= cMaxSPUs)?0 : nextSPUId;
	//in case we got only one 1 SPU, do not send signals at all
	//NOTE(review): the check below tests for cMaxSPUs == 0 whereas the comment talks about a single SPU, 
	//	verify against the value the PPU packs into cPacketInfo[0]
	//SPU 0 polls from the beginning, so the SPU preceding it starts with the activated state
	NSPU::NDriver::g_sEventStatNextSPU = 
		spu_insert((cMaxSPUs == 0)?STATE_NEXT_SPU_NOT_AVAIL : 
		(nextSPUId == 0)?STATE_NEXT_SPU_ACTIVATED : STATE_NEXT_SPU_DEACTIVATED, NSPU::NDriver::g_sEventStatNextSPU, 1);
	//initialize next SPU ID
	NSPU::NDriver::g_sEventStatNextSPU = spu_insert(nextSPUId, NSPU::NDriver::g_sEventStatNextSPU, 3);
#if defined(SUPP_PRINTF)
	NSPU::NDriver::g_DestPrintfAreaEA = cPacketInfo.data[1] + NPPU::scPrintfBufDiff + cSPUID * SPU_PRINTF_BUF_SIZE;
#endif

	NSPU::NDriver::g_sJobQueuePullBack.lockObtained = 0;

	//the areas below are located relative to the PPU communication area (cPacketInfo[1])
#if defined(DO_SPU_PROFILING)
	NSPU::NDriver::g_DestProfAreaEA		= cPacketInfo.data[1] + NPPU::scProfBufDiff + cSPUID * MAX_PROF_ID * (4*3);
#endif
#if defined(SUPP_SPU_FRAME_STATS)
	NSPU::NDriver::g_DestStatsAreaEA	= cPacketInfo.data[1] + NPPU::scStatsDiff;
#endif

	NSPU::NDriver::g_DestMemAreaEA = cPacketInfo.data[1] + cSPUID * SIZEOF_SPPUMEMREQUESTDATA;
	
	NSPU::NDriver::g_sMemMan.Init(NSPU::NDriver::g_DestMemAreaEA);	//init memory management

	//init bubble directory
	const uint32 cBubbleInfoEA = cPacketInfo.data[2];
	NSPU::MemcpyLS(&NSPU::g_BubInfo, cBubbleInfoEA, sizeof(NSPU::SBubbleDirInfo), g_scDMABubbleTag0);	
	NSPU::SyncMemory(g_scDMABubbleTag0);
	assert((availMemStart & 0xF) == 0);//expect 16 byte alignment
	//the bubble directory is placed straight behind the driver
	NSPU::g_GlobalSPUBubbleDir = (NSPU::SBubbleInfo*)availMemStart;
	const uint32 cBubbleDirSize = NSPU::AlignSize16(NSPU::g_BubInfo.bubbleNum * sizeof(NSPU::SBubbleInfo));
	availMemStart += cBubbleDirSize;
	//the directory transfer overlaps with the bubble state reset below and is synced afterwards
	NSPU::MemcpyLS(NSPU::g_GlobalSPUBubbleDir, NSPU::g_BubInfo.bubbleDirEA, cBubbleDirSize, g_scDMABubbleTag1);	
	assert(scMaxSPUBubbleCount == 4);
	NSPU::g_SPUBubbleStates[0].Reset();
	NSPU::g_SPUBubbleStates[1].Reset();
	NSPU::g_SPUBubbleStates[2].Reset();
	NSPU::g_SPUBubbleStates[3].Reset();
	ResetBubbleDir();
	NSPU::SyncMemory(g_scDMABubbleTag1);

	//jobs expect this kind of alignment
	availMemStart = NSPU::AlignSize128(availMemStart);
	NSPU::g_SPUBubbleMem = (uint8*)availMemStart;

	uint32 eaDepJob;
	//keep grabbing jobs from the job queue
	while (1)
	{
		if(spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 0) == STATE_EVENT_POLLING)
		{
			WHILE(!NSPU::NDriver::GetJobFromJobQueue(cSPUID), false)//accelerate case where it is true
			{
				//check if we are told to change state into wait
				if(!spu_extract(NSPU::NDriver::g_sLockActive, 0))
					IF(spu_readchcnt(SPU_RdSigNotify2), false)
						goto SkipProcessingJob;
				//do some idle loops to relieve the bus a bit
				//the volatile counter keeps the compiler from removing the empty loop
				const uint32 cCurIdleLoops = spu_extract(NSPU::NDriver::g_sLockActive, 1);
				for(volatile uint32 i=0; i<cCurIdleLoops; ) i = i+1;
			}
			do
			{
				//we got a job
#if defined(DO_SPU_PROFILING) 
				NSPU::NDriver::g_JobFetchTime = 0xFFFFFFFFU - spu_readch(SPU_RdDec);//time from job signal to here
#endif
#if defined(DO_SPU_PROFILING) || defined(SUPP_SPU_FRAME_STATS)
				spu_writech(SPU_WrDec, 0xFFFFFFFFU);//reset decrementer
#endif
				NSPU::NDriver::ProcessJob(availMemStart, cSPUID, cMaxSPUs);
				eaDepJob = ~0;
				IF(NSPU::NDriver::g_sInfoBlock.depJobIndex == NSPU::NDriver::SInfoBlock::scNoIndex, true)
					break;
				//we got a dependent job, pop the job entry
				//its info block replaces the current one and is processed by the next loop iteration
				eaDepJob = NSPU::NDriver::g_sJobInfoBlockEA + sizeof(NSPU::NDriver::SInfoBlock) * NSPU::NDriver::g_sInfoBlock.depJobIndex;
				NSPU::MemcpyLSNoDebug((NSPU::TAddrLS)&NSPU::NDriver::g_sInfoBlock, eaDepJob, sizeof(NSPU::NDriver::SInfoBlock), 3);
				NSPU::NDriver::g_sInfoPacketAddr = eaDepJob;
				NSPU::SyncMemory(3);
			}
			WHILE(eaDepJob != ~0, false);
		
			//we have finished processing the job, notify next spu that we are back idle
			const bool cStateNextSPUActive = (cMaxSPUs > 0) && ((spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 1) == STATE_NEXT_SPU_ACTIVATED));
			if(cStateNextSPUActive)
			{
				//send STATE_WAIT signal to next SPU
				NSPU::NDriver::SendSignal2(STATE_EVENT_WAIT, spu_extract(NSPU::NDriver::g_sEventStatNextSPU, 3));
				NSPU::NDriver::g_sEventStatNextSPU = spu_insert(STATE_NEXT_SPU_DEACTIVATED, NSPU::NDriver::g_sEventStatNextSPU, 1);//change state
			}
			//reset idle loops to fast polling
			NSPU::NDriver::g_sLockActive = spu_insert(FAST_POLLING_IDLE_LOOP_CNT, NSPU::NDriver::g_sLockActive, 1);
		}
		else
		{
SkipProcessingJob:
			//wait for signal event from previous SPU (can only happen for SPU ID 1..4)
			//the received signal value becomes the new event state of this SPU
			const uint32 cSignalRes = (uint32)spu_readch(SPU_RdSigNotify2);
			NSPU::NDriver::g_sEventStatNextSPU = spu_insert(cSignalRes, NSPU::NDriver::g_sEventStatNextSPU,	0);
		}
	}
}

#endif //__SPU__
#endif //PS3
