/*
	implementation of spu job manager
	DMA memory mappings can be issued in any order
*/

#if defined(PS3)

#define eCryModule eCryM_Launcher
#include <CryModuleDefs.h>
#include <platform.h>
#include <limits>
#include <sys/timer.h>
#include <sys/spu_initialize.h>	//for sys_spu_initialize
#include <sys/spu_utility.h>
#include <sys/ppu_thread.h>
#include <sys/spu_thread.h>
#include <sys/interrupt.h>
#include <sys/time_util.h>
#include <sys/sys_time.h>
#include <sys/paths.h>
#include <cell/sysmodule.h>
#include <cell/cell_fs.h>
#include <ppu_intrinsics.h>
#include <sys/tty.h>
#include <CryThread.h>
#include <sys/dbg.h>
#include <cell/gcm/gcm_macros.h>
#include <IJobManSPU.h>
#include "../SPU/SPU.h"
#include "PPU.h"
#include "JobManSPU.h"
#include "../SPU/Elf.h"
#include "SPUJobBase.h"


//SPU run state flag values
//NOTE(review): presumably these mirror the bits of the SPU status register; none of them is referenced in the visible part of this file - confirm they are still needed
#define SPU_STATUS_STOPPED							0x0L
#define SPU_STATUS_RUNNING							0x1L
#define SPU_STATUS_STOPPED_BY_STOP			0x2L
#define SPU_STATUS_STOPPED_BY_HALT			0x4L
#define SPU_STATUS_WAITING_FOR_CHANNEL  0x8L
#define SPU_STATUS_SINGLE_STEP					0x10L

//even though the driver is loaded by the SPU itself, we need to fake the load to make the debugger load the symbols
//SPU_IMAGE is the path of the driver elf handed to sys_raw_spu_load in LoadSPULoaderDriver,
//	one file per build flavour (debug, profiling, default)
#if defined(_DEBUG)
	#define SPU_IMAGE "/app_home/SPUDriver_debug.elf"
#else
	#if defined(DO_SPU_PROFILING)
		#define SPU_IMAGE "/app_home/SPUDriver_profile.elf"
	#else
		#define SPU_IMAGE "/app_home/SPUDriver.elf"
	#endif
#endif

#include "../SPU/CodePage/SPUPageLayout.h"

#include <spu_printf.h>

//iteration cap; its user is not part of the visible portion of this file
#define MAX_ITER 40000000

//externally provided SPU repository blob, InitSPUs interprets it as NPageBin::SHeader and asserts a 128 byte alignment
extern int SPURepository[];

#if defined(SUPP_SN)
	//checked by WaitSPUJob when a wait times out: only if non zero the running SPUs get stopped for the debugger
	uint32 g_ForceStopSPUs = 0;//set to 1 to force stoppage of SPUs
#endif

//external symbol for the char array of code
//binary entry symbol main will be renamed to SPUDriver
//code is added to the ppu object code via ppu-lv2-objcopy
extern char SPUDriver[];
//parameter block filled by LoadSPULoaderDriver and transferred to the SPU local store at NSPU::scLoaderParamsStart
NSPU::SLoaderParams NPPU::CJobManSPU::scSPULoaderParam _ALIGN(4096);
//text of the initial SPU loader, transferred to the SPU local store at NSPU::scLoaderProgramStart by LoadSPULoaderDriver
uint32 NPPU::CJobManSPU::scInitalSPULoader[NSPU::scLoaderTextSizeBytes >> 2] _ALIGN(128) = SPU_LDR_TEXT;

namespace
{
	//upper bound for a single job, expressed in microseconds (1 second)
	static const float scMaxJobTimeout = 1000000.f;

	//returns true once more than scMaxJobTimeout microseconds worth of time base ticks have passed since cStartTime
	//cStartTime is a time base value as delivered by NPPU::GetTimeTB
	inline const bool JobTimeOutElapsed(const uint64_t cStartTime)
	{
		//evaluated on the first call only: whole time base ticks per microsecond (integer division) scaled by the timeout
		static const uint64_t scTickBudget = (uint64_t)(scMaxJobTimeout * (float)(sys_time_get_timebase_frequency() / (1000 * 1000)));
		uint64_t now;
		NPPU::GetTimeTB(now);
		const uint64_t cElapsedTicks = now - cStartTime;
		return (cElapsedTicks > scTickBudget);
	}
}

using namespace NPPU;

//returns the one job manager object, it is constructed from the embedded SPUDriver image on the first call
CJobManSPU* CJobManSPU::Instance()
{
	static CJobManSPU sJobManager(SPUDriver);
	CJobManSPU *const pJobManager = &sJobManager;
	return pJobManager;
}

//free function access to the log object currently registered at the job manager (whatever GetLog() returns)
ILog* GetILog()
{
	CJobManSPU *const pJobMan = CJobManSPU::Instance();
	return pJobMan->GetLog();
}

//registers the log object used for error output (see WaitSPUJob), the pointer is stored as is and not owned
void CJobManSPU::SetLog(ILog *pLog)
{
	this->m_pLog = pLog;
}
/*
void CJobManSPU::SyncMFCCmd
(
	const uint32 cSPUId, 
	const uint32 cDMATag
) const
{
	const uint32 cTagMask = 0x1 << cDMATag;
	sys_raw_spu_mmio_write(cSPUId, Prxy_QueryMask, cTagMask);
	do 
	{
		Eieio();
	} 
	while((sys_raw_spu_mmio_read(cSPUId, Prxy_TagStatus) & cTagMask) == 0);
}
*/
void CJobManSPU::SendMFCCmd
(
	const uint32 cSPUId, 
	const uint32 cLS, 
	const uint32 cEA, 
	const uint32 cSize, 
	const uint32 cDMATag, 
	const uint32 cCommand
) const
{
	//		const uint32 cEAh = (uint32)((uint64)(cEA)>>32);
	const uint32 cEAh = 0;
	const uint32 cEAl = (uint32)cEA;
	//set the DMA parameters to appropriate registers via MMIO, check the DMA command status by reading the DMACMDStatus register
	//if the status is 0, the DMA command enqueue has succeeded, otherwise, the DMA parameter must be set again
	uint32 dmaStat = 1;
	WriteSPUProbReg(cSPUId, scPCDMALSA, cLS);
	WriteSPUProbReg(cSPUId, scPCDMAEAH, cEAh);
	WriteSPUProbReg(cSPUId, scPCDMAEAL, cEAl);
	WriteSPUProbReg(cSPUId, scPCDMASizeTag, (cSize << 16) | cDMATag);
	do 
	{
		WriteSPUProbReg(cSPUId, scPCDMAClassCMD, cCommand);
		dmaStat = ReadSPUProbReg(cSPUId, scPCDMACMDStatus);
	}while(dmaStat != 0);
}

/*
	constructs the job manager in its not yet initialized state, no SPU is touched here (that is done by InitSPUs)
		pDriver: start of the SPU driver image, it is kept as m_pElfInfo and the driver size is taken from its header
	all members are reset to NULL/0 defaults, m_NumSPUAllowed starts at scMaxSPU
*/
CJobManSPU::CJobManSPU(void* __restrict pDriver) 
	: m_pElfInfo(NULL), m_NumSPUAllowed(scMaxSPU), m_DriverSize(0), 
		m_Initialized(false), m_pLastAddedInfoBlock(NULL), m_pLog(NULL),
		m_JobNum(0),m_SpursInitialized(0),m_bEnablePrintf(true)
{
	m_pSPURep		= NULL;
	m_pPageDir	= NULL;
	m_RealCurPushAddress = ~0;
	//-1 marks the gcm offsets as not set, CreateSPUPageDir fills in defaults in that case
	m_PageInfo.gcmCmdResetOffset = -1;
	//the elf parsing path below is compiled out, the driver is taken as a plain image with a spu_mod_hdr in front
#if 0
//#if !defined(SUPP_SN)
	const NSPU::NElf::EParseResult cRes = NSPU::NElf::ParseElf(pDriver, m_pElfInfo);		//parse spu-elf
	switch(cRes)
	{
		case NSPU::NElf::ePR_NoElf:				
			PrintOut("ERROR: SPU Driver is no valid ELF-file\n");
			break;
		case NSPU::NElf::ePR_NoSPUElf:		
			PrintOut("ERROR: SPU Driver is an ELF-file but no valid SPU-ELF-file\n");
			break;
		case NSPU::NElf::ePR_ElfTooBig:		
			PrintOut("ERROR: SPU Driver is too big to be uploaded\n");
			break;
		case NSPU::NElf::ePR_NoQWAddress:		
			PrintOut("ERROR: SPU Driver image start is not on a quadword address\n");
			break;
		case NSPU::NElf::ePR_Success:
		default:
			break;
	}
	if(cRes != NSPU::NElf::ePR_Success)
	{
		m_NumSPUAllowed = 0;//make sure its not getting uploaded
		PrintOut("ERROR: SPU Driver elf could not been parsed\n");
		exit(1);
	}

	m_DriverSize = NSPU::AlignSize16(m_pElfInfo->LSDestination + m_pElfInfo->imageSize);
#else
	m_pElfInfo = (spu_mod_hdr*)pDriver;
	//NOTE(review): the pad field of the header is read as the driver size here - presumably the build step stores the image size there, confirm
	m_DriverSize = NSPU::AlignSize16(m_pElfInfo->pad);
#endif

	InitLock();

#if defined(SUPP_SN)
	//debugger support state, everything disabled until requested
	m_SPUJobDebugHandle					= (void*)0;
	m_SPUDriverDebuggingEnabled = false;
	m_DebuggingActive						= false;
	m_FrameDebuggingActive			= false;
#endif

#if defined(SUPP_SPU_FRAME_STATS)
	m_CurFrameBufIndex = 0;
#endif

#if defined(DO_SPU_PROFILING)
	//PrintPerfStats does not print anything as long as this is NULL
	m_pProfStatControl = NULL;
#endif

	m_AllocatedMemory = 0;
	m_GcmAddressBase	= 0;
	m_CellGcmCurrentContext = 0;
	//the allocator callbacks are supplied later by InitSPUs
	m_FreeFunc = 0;
	m_MallocFunc = 0;

#if defined(SUPP_SPU_FRAME_STATS)
	//allocated by CreatePageJobStringTable
	m_pFrameProfileData = NULL;
#endif

	for(uint32 i=0; i<NPPU::scMaxSPU; ++i)
		m_pBucketDirMem[i] = NULL;

	m_pFuncProfilingArea = NULL;
}

/*
	blocks the calling thread until rJobState no longer reports IsRunning() or the timeout has elapsed
		rJobState:  state of the job to wait for, polled in a loop
		cTimeOutMS: timeout in milliseconds, -1 selects the default of 1024 ms
	returns true if the job has finished
	without SUPP_SN: returns false as soon as the timeout has elapsed (after logging the timeout)
	with SUPP_SN: an elapsed timeout is only acted upon if g_ForceStopSPUs is set, no debugging session has been seen
		during the wait and an explicit timeout was passed: the processing SPUs are stopped, the debugger is paused,
		the job is awaited without timeout, all SPUs are continued and true is returned, otherwise the wait just goes on
	the time stamps are rdtsc() ticks, the code assumes roughly 80*1024 ticks per millisecond
*/
const bool CJobManSPU::WaitSPUJob(volatile NSPU::NDriver::SExtJobState& rJobState, const int cTimeOutMS) const
{
	const uint32 cTimeOut		= (cTimeOutMS == -1)?1024 : (uint32)cTimeOutMS;//1 sec default timeout
	uint64 curTime					= rdtsc();
	//the tick count is calculated in 64 bit, as a 32 bit product it wraps for timeouts above ~52 seconds
	const uint64 cTimeOutTime			= curTime	 + ((uint64)80*cTimeOut*1024);
	//up to this point the loop spins, afterwards the thread yields between two polls
	const uint64 cDb16TimeOutTime	= curTime	 + (16*1024);//0.2 ms
#if defined(SUPP_SN)
	int debuggingEnabled = 0;//sticky: set once debugging has been active at any poll
#endif
	while(rJobState.IsRunning())
	{
#if defined(SUPP_SN)
		if(!debuggingEnabled && IsDebuggingActive())
			debuggingEnabled = 1;
#endif
		curTime	= rdtsc();
		if(curTime > cTimeOutTime)
		{
	#if defined(SUPP_SN)
			if(g_ForceStopSPUs && !debuggingEnabled && cTimeOutMS != -1)
			{
	#endif
				if(m_pLog)
					m_pLog->LogError("Timeout in WaitSPUJob (waiting for job state)\n");
				else
					PrintOut("\nTimeout in WaitSPUJob (waiting for job state)\n");
	#if defined(SUPP_SN)
				if(0 == VerifySPUs())
				{					
					PrintOut("Stopped all running SPUs(index: job):");
					for(uint32 i=0; i<m_NumSPUAllowed; ++i)
					{
						if(!IsSPUNonThreadedProcessing(i))
							continue;
						PrintOut("  %d: %s",i,RetrieveCurrentSPUJob(i));
						WriteSPUProbReg(i, scPCSPURunCntl, 0);
						snRawSPUNotifySPUStopped(i);
					}
					PrintOut("\n");
				}
				snPause();
				//wait without any timeout for the job to finish (the SPUs get resumed from the debugger)
				while(rJobState.IsRunning()){}
				PrintOut("Continuing all SPUs\n");
				for(uint32 i=0; i<m_NumSPUAllowed; ++i)
				{
					WriteSPUProbReg(i, scPCSPURunCntl, 1);
					snRawSPUNotifySPUStarted(i);
				}
				return true;
			}
	#else
			return false;
	#endif
		}
		if(curTime > cDb16TimeOutTime)
		{
			SleepYield(0);
		}
		else
		{
			//short busy wait before the job state is polled again
			for(uint32 i=0; i<32; ++i)
			{
				__db16cyc();
				__db16cyc();
				__db16cyc();
				__db16cyc();
				__db16cyc();
				__db16cyc();
				__db16cyc();
				__db16cyc();
			}
		}
	}
	return true;
}

/*
	dumps the performance counters of one job via PrintOut, compiled to an empty function without DO_SPU_PROFILING
		pPerfStats: counters written for the job, nothing is printed if NULL or if spuSetupTime is not above 0
		cpJobName:  name shown in the header line
	output is only generated while the value m_pProfStatControl points to equals 1
*/
void CJobManSPU::PrintPerfStats(const volatile NSPU::NDriver::SJobPerfStats* pPerfStats, const char* cpJobName) const
{
#if defined(DO_SPU_PROFILING)
	//bail out if there is nothing to report, same evaluation order as a chained && test
	if(!pPerfStats || !m_pProfStatControl || !(*m_pProfStatControl==1) || !((float)pPerfStats->spuSetupTime > 0.f)/*not executed otherwise*/)
		return;

	//converts time base ticks into microseconds
	const float cTicksToUsec	= 1000000.f / (float)NPPU::GetTimeBaseFrequency();
	//cpu cycles per time base tick, currently not used by any output line
	const float cTicksToCycles = (float)NPPU::GetCPUFrequency() / (float)NPPU::GetTimeBaseFrequency();

	PrintOut("\n\n------------------Begin JobStatistics for \"%s\"------------------\n\n",cpJobName);
	PrintOut("SPU driver time:       %.2f usec\n", (float)pPerfStats->spuSetupTime * cTicksToUsec);
	PrintOut("SPU job fetch time:    %.2f usec\n", (float)pPerfStats->spuFetchTime * cTicksToUsec);

	//job and cache miss time switch to milliseconds above 1000 usec
	const float cJobUsec = (float)pPerfStats->spuJobTime * cTicksToUsec;
	if(!(cJobUsec > 1000))
		PrintOut("SPU job time:          %.2f usec\n", cJobUsec);
	else
		PrintOut("SPU job time:          %.2f ms\n", cJobUsec * 0.001f);

	const float cMissUsec = (float)pPerfStats->spuCacheMissTime * cTicksToUsec;
	if(!(cMissUsec > 1000))
		PrintOut("SPU cache miss time:   %.2f usec\n", cMissUsec);
	else
		PrintOut("SPU cache miss time:   %.2f ms\n", cMissUsec * 0.001f);

	//software cache
	PrintOut("\nSPU cache hits:        %d\n",pPerfStats->cacheHits);
	PrintOut("SPU cache misses:      %d\n",pPerfStats->cacheMisses);
	PrintOut("SPU cache flush: write backs sync:  %d\n",pPerfStats->cacheWritesBackSync);
	PrintOut("SPU cache flush: write backs async: %d\n",pPerfStats->cacheWritesBackASync);
	PrintOut("SPU cache flush: no write back:     %d\n",pPerfStats->cacheFlushsNoWrite);
	PrintOut("SPU lost lines (write back):    %d\n",pPerfStats->lostLineEvents);
	PrintOut("SPU prefetch cache hits:        %d\n",pPerfStats->prefetchHits);
	PrintOut("SPU prefetch cache misses:      %d\n",pPerfStats->prefetchMisses);
	PrintOut("SPU cache mem transferred to LS:        %d KB\n",pPerfStats->memTransToLS >> 10);
	PrintOut("SPU cache mem transferred back from LS: %d KB\n",pPerfStats->memTransFromLS >> 10);

	//local store layout, the first three values are shifted left by 4 before printing
	PrintOut("\nSPU driverSize size:      %d bytes\n",pPerfStats->driverSize << 4);
	PrintOut("SPU job size:             %d bytes\n",pPerfStats->jobSize << 4);
	PrintOut("SPU 1st page size:        %d bytes\n",pPerfStats->firstPageSize << 4);
	PrintOut("SPU job avail.stack size: %d Kbytes\n",pPerfStats->stackSize);
	PrintOut("SPU job cache size:       %d Kbytes\n",pPerfStats->cacheSize);

	//memory management
	PrintOut("\nSPU allocations (non bucket system):     %d\n", pPerfStats->allocsNoBucket);
	PrintOut("SPU memory releases (non bucket system): %d\n", pPerfStats->freeCountNoBucket);
	PrintOut("SPU allocations (bucket system):         %d\n", pPerfStats->allocsBucket);
	PrintOut("SPU memory releases (bucket system):     %d\n", pPerfStats->freeCount);
	PrintOut("SPU memory releases history matches (bucket system): %d\n", pPerfStats->freeCountHistory);
	PrintOut("SPU allocation size:                  %d\n", pPerfStats->allocSize);
	PrintOut("SPU effective bucket allocation size: %d\n", pPerfStats->totalBucketAllocSize);

	//code paging
	PrintOut("SPU func.ptr hits:        %d\n",pPerfStats->funcPtrHits);
	PrintOut("SPU func.ptr misses:      %d\n",pPerfStats->funcPtrMisses);
	PrintOut("SPU page mem transferred: %d Kbytes\n",pPerfStats->pageMemTransferred >> 10);
	PrintOut("SPU return miss calls:    %d\n",pPerfStats->returnMissHandlerCalls);
	PrintOut("SPU return page misses:   %d\n",pPerfStats->pageMissesRetMissHandler);
	PrintOut("SPU call miss calls:      %d\n",pPerfStats->callMissHandlerCalls);
	PrintOut("SPU page misses:          %d\n",pPerfStats->pageMisses);
	PrintOut("SPU pages transferred:    %d\n",pPerfStats->pagesTransferred);
	PrintOut("\n-------------------End JobStatistics-------------------\n\n");
#endif
}

void CJobManSPU::ExecPPUCall(const uint32 cArg, const unsigned int cOpCode)
{
	switch (cOpCode)
	{
	case eEOC_CondNotify:
		{
			CryCondBase< CryLockT<CRYLOCK_FAST> > *const pCondBase = (CryCondBase<CryLockT<CRYLOCK_FAST> > *)(UINT_PTR)(cArg);
			pCondBase->PPUNotify();
		}
		break;
#if eEOC_CondNotifySingle != eEOC_CondNotify
	case eEOC_CondNotifySingle:
		{
			CryCondBase<CryLockT<CRYLOCK_FAST> > *const pCondBase = (CryCondBase<CryLockT<CRYLOCK_FAST> > *)(UINT_PTR)(cArg);
			pCondBase->PPUNotifySingle();
		}
		break;
#endif
	case eEOC_CondDestroy:
		{
			CryCondBase<CryLockT<CRYLOCK_FAST> > *const pCondBase = (CryCondBase<CryLockT<CRYLOCK_FAST> > *)(UINT_PTR)(cArg);
			pCondBase->PPUDestroy();
		}
		break;
	}
}

//returns true if the driver state word of the given SPU is neither SPUWaitState nor SPUPollState
//without SUPP_SN no state is read and false is returned in any case
bool CJobManSPU::IsSPUProcessing(const unsigned int cSPUIIndex) const
{
#if !defined(SUPP_SN)
	return false;
#else
	//spu_mod_hdr->pad holds the current processing state(first 32 bytes in non PIC SPUDriver image), read from LS offset 28
	const uint32 cDriverState = ReadSPULS(cSPUIIndex, 28);
	const bool cIsIdle = (SPUWaitState == cDriverState) || (SPUPollState == cDriverState);
	return !cIsIdle;
#endif
}

const char* CJobManSPU::RetrieveCurrentSPUJob(const uint32 cSPUId)const
{
	NSPU::SJobStorageInfo jobStorage;
	const uint32 cLSAddr = m_PageInfo.jobStorageLS;
	jobStorage.binJobAddrEA			= ReadSPULS(cSPUId, cLSAddr);
//	jobStorage.lsStartJob				= ReadSPULS(cSPUId, cLSAddr+4);
//	jobStorage.lsStartPageMem		= ReadSPULS(cSPUId, cLSAddr+8);
//	jobStorage.pageSize					= ReadSPULS(cSPUId, cLSAddr+12);
	static const char *scpUnknownJob = "UNKNOWN_JOB";
	if(jobStorage.binJobAddrEA == 0)
		return (const char*)scpUnknownJob;
	std::map<uint32,uint32>::const_iterator it = m_JobNameMap.find(jobStorage.binJobAddrEA);
	if(it == m_JobNameMap.end())
		return (const char*)scpUnknownJob;
	return (const char*)it->second;
}

/*
	entry function of the interrupt PPU threads created by CreateRawSpuIntrHandler, handles class 2 interrupts of one raw SPU
		spuId: id of the raw SPU that raised the interrupt (the argument given to sys_interrupt_thread_establish)
	mailbox interrupts: the value read from the interrupt mailbox selects the request
		- EVENT_COND bit set: condition variable request, forwarded to ExecPPUCall
		- otherwise the bits above EVENT_PORT_SHIFT select printf, callback, memory or profiling requests
	with SUPP_SN stop and halt interrupts are handled for the debugger too, all other interrupts are reported and acknowledged
	with SUPP_SN the handler runs between snRawSPULockHandler / snRawSPUUnlockHandler, the lock is released exactly once on each path
	NOTE(review): the error paths rely on sys_interrupt_thread_eoi() ending the handler invocation (presumably it does not return) - confirm
	NOTE(review): resetRet is only evaluated for mailbox interrupts, the results of the stop/halt acknowledge are not checked
*/
void CJobManSPU::HandleSpuInterrupt(uint64_t spuId)
{	
	const sys_raw_spu_t cId = spuId;

#if defined(DO_SPU_PROFILING)
	static uint32 sProfFileCntr = 0;
#endif

#if defined(SUPP_SN)
	snRawSPULockHandler();
#endif
	uint64_t stat = 0;
	uint32 mail = 0;
	int resetRet = CELL_OK;
	int ret = sys_raw_spu_get_int_stat(cId, 2, &stat);	//fetch the pending class 2 interrupt status (PPU interrupt mailbox, stop, halt)
	if(CELL_OK != ret)
	{
//		if(GetILog())
//			GetILog()->LogError("sys_raw_spu_get_int_stat is failed %d\n", ret);
		PrintOut("sys_raw_spu_get_int_stat is failed %d\n", ret);
#if defined(SUPP_SN)
		snRawSPUUnlockHandler();
#endif
		sys_interrupt_thread_eoi();
	}
	//if the caught class 2 interrupt includes mailbox interrupt, handle it
	if((stat & scPCIntStatMailbox) == scPCIntStatMailbox)
	{
		ret = sys_raw_spu_read_puint_mb(cId, &mail);
		if(CELL_OK != ret)
		{
//			if(GetILog())
//				GetILog()->LogError("sys_raw_spu_read_puint_mb is failed %d\n", ret);
			PrintOut("sys_raw_spu_read_puint_mb is failed %d\n", ret);
#if defined(SUPP_SN)
			//release the handler lock on this exit too, as on the other error exits
			snRawSPUUnlockHandler();
#endif
			sys_interrupt_thread_eoi();
		}
		//extra treatment when most upper bit is set
		if(mail & EVENT_COND)
		{
			//all addresses are relative to this one
			const uint32 cAddr = (mail & EVENT_ADRESS_MASK) << EVENT_ADRESS_SHIFT;
			const unsigned int cOpCode	= (unsigned int)(mail & EVENT_OPCODE_MASK);
			//reset the PU interrupt mailbox interrupt status bit
			resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
			CJobManSPU::Instance()->ExecPPUCall(cAddr, cOpCode);
//			if(NSPU::scEventBlocking[(int)cOpCode])
//				WriteSPUProbReg(cId, scPCSigNotify1, 1);
		}
		else
		{
			const uint32 cEventID = (mail >> EVENT_PORT_SHIFT);
			switch(cEventID)
			{
			case EVENT_PRINTF_PORT:
				{	
					//spu_sprintf request, address is stored in the non interrupt mailbox
					char buffer[1024];
					uint32 lsAddr = ReadSPUProbReg(cId, scPCPPUMB);
					//get format string and variable values from local store and to print on a console
					spu_raw_snprintf(buffer, sizeof(buffer), GetSPULSBaseAddr(cId), lsAddr);
					//prefix the text with SPU id and job name, bounded since prefix + text can exceed the buffer
					//NOTE(review): the static buffer is shared by the interrupt threads of all SPUs - confirm they cannot run concurrently
					static char bufferP[1024];
					snprintf(bufferP, sizeof(bufferP), "SPU%d(%s): %s", cId, CJobManSPU::Instance()->RetrieveCurrentSPUJob(cId), buffer);
					//the text comes from the SPU, never use it as format string
					const int cSysRet = PrintOut("%s", bufferP);
					if( CJobManSPU::Instance()->IsPrintfEnabled())
					{
						sys_tty_write(SYS_TTYP4, bufferP, strlen(bufferP), NULL);
					}
	//				if(GetILog())
	//					GetILog()->Log(bufferP);
					//reset the PU interrupt mailbox interrupt status bit
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					WriteSPUProbReg(cId, scPCSPUMB, cSysRet);//SPUs printf is expecting a return value
	#if !defined(SUPP_SN)
					//check if it was an failed assertion, exit due to halted SPU
					if(memcmp(SPU_ASSERT_STRING, buffer, strlen(SPU_ASSERT_STRING)) == 0)
						abort();
	#endif
					break;
				}
				case EVENT_CUSTOM_CALLBACK_PORT:
				{
					//call the callback registered for this SPU at the memory area manager
					CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
					const NPPU::SCallback crCallback = rMemAreaMan.GetCallbackData(cId);
					crCallback.pCallbackFnct(crCallback.pArg);
					WriteSPUProbReg(cId, scPCSigNotify1, 1);//post finished
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
				case EVENT_PRINTF_PORT_CUSTOM:
				{
					CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
					const char* cpBuf = rMemAreaMan.HandlePrintfRequest(spuId, CJobManSPU::Instance()->RetrieveCurrentSPUJob(spuId));
	//				if(cpBuf && GetILog())
	//					GetILog()->Log("SPU%d: %s",(int)spuId,cpBuf);
					WriteSPUProbReg(cId, scPCSigNotify1, 1);//printf is expecting a value
					//reset the PU interrupt mailbox interrupt status bit
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
			case EVENT_CALLBACK_PORT:
				{	//callback request, index is stored in the lower 8 bit, also set job state to not running
					const uint8 cCallbackIndex = (mail & 0xFF);
	#if defined(SUPP_SN)
					if(cCallbackIndex == NPPU::scDebugCallbackPort)
					{
						//release debugging
						//NOTE(review): this path leaves without resetting the mailbox interrupt status bit - confirm this is intended
						CJobManSPU::Instance()->SetDebuggingActive(false);	//inform job manager
						break;
					}
	#endif
					static NPPU::SQueueNodeSPU& rJobQueue	= CJobManSPU::Instance()->GetJobQueue();
					NPPU::SJobData& rJobData							= rJobQueue.jobData[cCallbackIndex];
	//				while(rJobQueue.jobInfoBlocks[cCallbackIndex].jobState.IsRunning()){}//wait til all data has been transferred (time critical with flushing cache)
					//call callback directly
					assert(rJobData.callbackData.pCallbackFnct);
					rJobData.callbackData.pCallbackFnct(rJobData.callbackData.pArg);
					rJobQueue.jobInfoBlocks[cCallbackIndex].jobState.running = 0;//mark job as finished
					//reset the PU interrupt mailbox interrupt status bit
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
			case EVENT_MEMCHANGE_PORT:
				{
					CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
					rMemAreaMan.HandleMemRequest(spuId);
					//reset the PU interrupt mailbox interrupt status bit
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
			case EVENT_MEM_CLEANUP:
				{
					CJobManSPU *const __restrict pJobManSPU = CJobManSPU::Instance();
					pJobManSPU->CJobManSPU::UpdateSPUMemManSingleSPU(cId, true);
					WriteSPUProbReg(spuId, scPCSigNotify1, 0);
					//reset the PU interrupt mailbox interrupt status bit
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
	#if defined(DO_SPU_PROFILING)
			case EVENT_PROF_PORT:
				{
					//the job id is stored in the lower 8 bit, each request gets the next file counter value
					const uint8 cJobId = (mail & 0xFF);
					CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
					rMemAreaMan.HandleProfRequest(spuId, sProfFileCntr++, cJobId);
					WriteSPUProbReg(cId, scPCSigNotify1, 1);//printf is expecting a value
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
	#endif
			default:
				{
	//				if(GetILog())
	//					GetILog()->LogError("caught unknown interrupt: %d\n",cEventID);
					PrintOut("caught unknown interrupt: %d\n",cEventID);
					resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
					break;
				}
			}
		}
		if(CELL_OK != resetRet)
		{
			//report a failed acknowledge only once
			static int messagePosted = 0;
//			if(GetILog())
//				GetILog()->LogError("sys_raw_spu_set_int_stat is failed: %d\n", resetRet);
			if(!messagePosted)
			{
				PrintOut("sys_raw_spu_set_int_stat is failed: %d\n", resetRet);
				messagePosted = 1;
			}
#if defined(SUPP_SN)
			snRawSPUUnlockHandler();
#endif
			sys_interrupt_thread_eoi();
		}
	}
	else
#if defined(SUPP_SN)
	if(stat & scPCIntStatStopSignal)	//stop
	{
		//get stop signal
		NPPU::SSpuStatusRegister status;
		status.val = *(volatile uint32_t*)get_reg_addr(cId, scPCSPUStatus);
		switch(status.sc)
		{
		case 0x3:
			*(volatile uint32_t*)get_reg_addr(spuId, scPCSPURunCntl) = 0x1;			//restart the SPU
			break;

		case 254:
			{	//was custom stop instruction with branch register dest set, move PC to branch target
				//261616: upper 512 bytes reused for arrays, use everything below that
				const uint32 cBranchDest = ReadSPULS(spuId, (261616));
				WriteSPUProbReg(spuId, scPCSPUNPC, cBranchDest+4);//make PC point to branch target
				snRawSPUNotifySPUStopped(spuId);
				CJobManSPU::Instance()->SetDebuggingActive(true);	//inform job manager
				break;
			}

		case 255:
			{	//was custom stop instruction without register, move PC to next instruction
				const uint32 cCurPC = ReadSPUProbReg(spuId, scPCSPUNPC);
				WriteSPUProbReg(spuId, scPCSPUNPC, cCurPC+4);//make PC point to next instruction
				snRawSPUNotifySPUStopped(spuId);
				CJobManSPU::Instance()->SetDebuggingActive(true);	//inform job manager
				break;
			}
		default:
			snRawSPUNotifySPUStopped(spuId);
			break;
		}
		resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatStopSignal);
	}
	else 
	if(stat & scPCIntStatHaltSignal)	//halt
	{
		snRawSPUNotifySPUStopped(spuId);
		resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatHaltSignal);
	}
	else
#endif
	{
//		if(GetILog())
//			GetILog()->LogError("unknown interrupt stat: %ld\n", (long int)stat);
		PrintOut("Unknown Interrupt. stat: %ld\n", (long int)stat);
		//reset interrupt status bit of those not handled
		ret = sys_raw_spu_set_int_stat(cId, 2, stat);
		if(CELL_OK != ret)
		{
//			if(GetILog())
//				GetILog()->LogError("sys_raw_spu_set_int_stat failed: %d\n", ret);
			PrintOut("sys_raw_spu_set_int_stat failed: %d\n", ret);
			//no unlock here, the common exit below releases the handler lock
		}
	}
#if defined(SUPP_SN)
	snRawSPUUnlockHandler();
#endif
	__lwsync();
	sys_interrupt_thread_eoi();
}

/*
	creates the interrupt PPU thread for one raw SPU and connects it to the class 2 interrupts of that SPU
		cSPUId: id of the raw SPU
		cIndex: slot in m_SpuIntTags / m_SpuIntHandle receiving the created tag and handle
	returns true if thread, interrupt tag, thread establishment and interrupt mask have all been set up
	returns false on the first failing step (after printing the error code)
	NOTE(review): objects created by earlier steps are not released when a later step fails
*/
const bool CJobManSPU::CreateRawSpuIntrHandler(const sys_raw_spu_t cSPUId, const uint32 cIndex)
{
	//create an interrupt handler and establish it on an interrupt PPU thread 
	//this PPU interrupt thread is going to handle interrupt mailbox events such as SPUs printf
	sys_ppu_thread_t handler;
	char threadNameBuf[64];
	sprintf(threadNameBuf, "Interrupt thread SPU %d", (uint32)cSPUId);
	int ret = sys_ppu_thread_create(&handler, HandleSpuInterrupt, cSPUId, 400, scPCPPUStackSize, SYS_PPU_THREAD_CREATE_INTERRUPT, threadNameBuf);
	if(CELL_OK != ret)
	{
		//without a thread the handle is not valid and must not be passed to sys_interrupt_thread_establish
		PrintOut("sys_ppu_thread_create is failed %d\n", ret);
		return false;
	}
	ret = sys_raw_spu_create_interrupt_tag(cSPUId, 2, SYS_HW_THREAD_ANY, &m_SpuIntTags[cIndex]);
	if(CELL_OK != ret)
	{
		PrintOut("CJobManSPU::CreateRawSpuIntrHandler: sys_raw_spu_create_intr_tag() failed (returned %d)",ret);
		return false;
	}
	ret = sys_interrupt_thread_establish(&m_SpuIntHandle[cIndex], m_SpuIntTags[cIndex], handler, cSPUId);
	if(CELL_OK != ret)
	{
		PrintOut("CJobManSPU::CreateRawSpuIntrHandler: sys_intr_thread_establish() failed (returned %d)",ret);
		return false;
	}
	//set interrupt mask, the third argument = 1 enables PPU Mailbox interrupts
	//with SUPP_SN the mask 7 is used, HandleSpuInterrupt handles stop and halt interrupts in that configuration too
#if defined(SUPP_SN)
	ret = sys_raw_spu_set_int_mask(cSPUId, 2, 7);
#else
	ret = sys_raw_spu_set_int_mask(cSPUId, 2, 1);
#endif
	if(CELL_OK != ret)
	{
		PrintOut("CJobManSPU::CreateRawSpuIntrHandler: raw_spu_set_int_mask() failed: %d", ret);
		return false;
	}
/*	ret = sys_raw_spu_set_int_stat(cSPUId, 2, ~0x0);
	if(CELL_OK != ret)
	{
		printf("CJobManSPU::CreateRawSpuIntrHandler: raw_spu_set_int_stat() failed (returned %d)",ret);
		return false;
	}
*/
	return true;
}

/*
	starts one raw SPU: fills the shared loader parameter block, transfers it and the initial loader into the local store
	and lets the SPU start executing the loader
		cRealSPUId:  id used for the DMA commands and problem state register writes
		cSPUIndex:   index of the SPU, stored in the upper 8 bit of gpr3[3] and used for the sys_raw_spu_load / libsn calls
		cIsRecreate: if true the elf load for the debugger and the libsn notifications are skipped
	NOTE(review): scSPULoaderParam is one static block shared by all SPUs and the function does not wait for the DMA
		to finish - confirm a following call cannot overwrite it before the previous transfer has completed
	NOTE(review): sys_raw_spu_load and the libsn calls get cSPUIndex while the DMA uses cRealSPUId - confirm both are interchangeable
*/
void CJobManSPU::LoadSPULoaderDriver
(
	const uint32 cRealSPUId,
	const uint32 cSPUIndex,
	const bool cIsRecreate
) const
{
#if 1
//#if defined(SUPP_SN)
	if(!cIsRecreate)
	{
		//load the driver elf from SPU_IMAGE and announce it to the debugger (see the comment at SPU_IMAGE)
		//NOTE(review): snRawSPUNotifyElfLoad is called outside of any SUPP_SN guard here
		uint32_t entry;
		int ret = sys_raw_spu_load(cSPUIndex, SPU_IMAGE, &entry);
		if(ret == SUCCEEDED)
			snRawSPUNotifyElfLoad(cSPUIndex, entry, SPU_IMAGE);
	}
	//parameters that will be used by spu loader to load spu driver
	scSPULoaderParam.bin.imageSize  = m_DriverSize;
//	scSPULoaderParam.bin.imageSize  = 0;
	scSPULoaderParam.bin.imageEAlow = (uint32)m_pElfInfo;
	scSPULoaderParam.bin.elfEAlow		= (uint32)m_pElfInfo;
	scSPULoaderParam.bin.destLS     = 0;
	scSPULoaderParam.bin.entry      = m_pElfInfo->entry;
#else
	scSPULoaderParam.bin.imageSize  = m_pElfInfo->imageSize;
	scSPULoaderParam.bin.imageEAlow = m_pElfInfo->spuImageEA.ui[1];
	scSPULoaderParam.bin.elfEAlow		= m_pElfInfo->spuElfEA.ui[1];
	scSPULoaderParam.bin.destLS     = m_pElfInfo->LSDestination;
	scSPULoaderParam.bin.entry      = m_pElfInfo->entry;
#endif

	//only 32 bit addresses are used, the upper halves stay 0
	scSPULoaderParam.bin.imageEAhi  = 0;
	scSPULoaderParam.bin.lsLimit    = NSPU::scLSSize;
	scSPULoaderParam.bin.elfEAhi		= 0;

	//parameters that will be passed to spu main() function
	//the SPU count is OR'ed into the address of the pull queue node (presumably the lower address bits are free due to alignment - confirm)
	scSPULoaderParam.thr.gpr3[0]		= (uint32)&m_SPUJobQueue.pull | m_NumSPUAllowed;
	scSPULoaderParam.thr.gpr3[1]		= (uint32)m_MemAreaMan.GetSPUMemArea(0);//id is used to calc correct offset on SPU
	scSPULoaderParam.thr.gpr3[2]		= (uint32)&m_PageInfo;

	//driver size in the lower 24 bit, SPU index in the upper 8 bit
	scSPULoaderParam.thr.gpr3[3]		= m_DriverSize | (cSPUIndex << 24);
	//copy parameters to LS offset SPU_LDR_PARAMS_start. 
	SendMFCCmd(cRealSPUId, (const uint32)NSPU::scLoaderParamsStart, (const uint32)&scSPULoaderParam, sizeof(scSPULoaderParam), 0, scPCMFCGetCMD);
	//set the SPUs program counter
	WriteSPUProbReg(cRealSPUId, scPCSPUNPC, NSPU::scLoaderProgramStart);
#if defined(SUPP_SN)
	if(!cIsRecreate)
		snRawSPUNotifySPUStarted(cSPUIndex);
#endif
	//copy loader to local store and start SPU execution (s bit set in MFC Get command)
	SendMFCCmd(cRealSPUId, (uint32)NSPU::scLoaderProgramStart, (uint32)&scInitalSPULoader, sizeof(scInitalSPULoader), 0, scPCMFCGetFSCMD);
}

void CJobManSPU::CreatePageJobStringTable()
{
	NPageBin::SJobStringHandle *pJobStringHandleRep = (NPageBin::SJobStringHandle*)((uint8*)m_pSPURep + m_pSPURep->jobStringOff);
	const uint32 cPageRepAddr = (uint32)m_pSPURep;
	const uint32 cJobNum = m_pSPURep->jobNum;
	m_JobNum = cJobNum;
	m_JobStringOffsets.resize(cJobNum);
#if defined(SUPP_SPU_FRAME_STATS)
	for(uint32 i=0; i<JOB_FRAME_STATS_BUFS;++i)
		m_CurFrameProfVec[i].resize(cJobNum);
	m_pFrameProfileData = new NPPU::SFrameProfileData[JOB_FRAME_STATS_BUFS*cJobNum];
#endif
	for(uint32 i=0; i<cJobNum; ++i)
	{
		SJobStringHandle newJobEntry;
		newJobEntry.cpString	= (const char*)(void*)(pJobStringHandleRep->jobStringStart + cPageRepAddr);
#if defined(SUPP_SPU_FRAME_STATS)
		for(uint32 j=0; j<JOB_FRAME_STATS_BUFS; ++j)
			m_pFrameProfileData[i*JOB_FRAME_STATS_BUFS+j].cpName	= newJobEntry.cpString;
#endif
		newJobEntry.strLen		= pJobStringHandleRep->jobStringSize;
		newJobEntry.jobHandle = (uint32)(pJobStringHandleRep->jobEntryOffset + cPageRepAddr);
//		((NPageBin::SJob*)(void*)newJobEntry.jobHandle)->jobNameEA = (unsigned int)newJobEntry.cpString;
		m_JobNameMap.insert(std::make_pair(newJobEntry.jobHandle,(unsigned int)newJobEntry.cpString));
		newJobEntry.jobId		  = i;
		m_JobStringOffsets[i] = pJobStringHandleRep;
		m_JobStringTable.push_back(newJobEntry);
		++pJobStringHandleRep;
	}
	std::sort(m_JobStringTable.begin(), m_JobStringTable.end());
}

const char* CJobManSPU::GetJobName(const uint32 cId)
{
	static char sJobName[64];
	assert(cId < m_JobStringOffsets.size());
	const NPageBin::SJobStringHandle *const cpJobStringHandleRep = m_JobStringOffsets[cId];
	const uint32 cPageRepAddr = (uint32)m_pSPURep;
	const uint32 cStrLen = cpJobStringHandleRep->jobStringSize;
	assert(cStrLen < sizeof(sJobName)-1);
	memcpy(sJobName, (const char*)(void*)(cpJobStringHandleRep->jobStringStart + cPageRepAddr), cStrLen);
	sJobName[cStrLen] = '\0';
	return sJobName;
}

void CJobManSPU::CreateSPUPageDir()
{
	const uint32 cSize = m_pSPURep->pageNum * sizeof(NSPU::SPageInfo);
	m_pPageDir = (NSPU::SPageInfo*)m_MallocFunc(cSize, 16);//align to 16 byte for DMA
	m_AllocatedMemory += cSize;
	m_PageInfo.pageDirEA = (uint32)m_pPageDir;
	m_PageInfo.pageNum = m_pSPURep->pageNum;
	//get page addresses into the directory
	NSPU::SPageInfo *pCurPageDir = m_pPageDir;
	const uint32 *pCurPageDirRep = (uint32*)((uint8*)m_pSPURep + m_pSPURep->pageOff);
	const uint32 cPageRepAddr = (uint32)m_pSPURep;
	const uint32 cPageNum = m_PageInfo.pageNum;
	for(uint32 i=0; i<cPageNum; ++i)
	{
		pCurPageDir->ea						= cPageRepAddr + *pCurPageDirRep++;
		NPageBin::SPage* pPage		= (NPageBin::SPage*)(void*)pCurPageDir->ea;
		pCurPageDir->size					= NPageBin::DecodePageSize(pPage->size);
		++pCurPageDir;
	}
	m_PageInfo.globalVarBaseAddr			= (int32)&CJobManSPU::scInitalSPULoader[0];
	m_PageInfo.gcmGlobalPPUContext		= m_CellGcmCurrentContext;
//	m_PageInfo.gcmGlobalPPUControlReg	= (uint32)(void*)cellGcmGetControlRegister();//put ptr is first
	m_PageInfo.gcmCmdAddressBase			= m_GcmAddressBase;
	m_PageInfo.ppuSyncEA							= (uint32)&m_SPUJobQueue;
	if(m_PageInfo.gcmCmdResetOffset == -1)//set if not set yet to some defaults
	{
		m_PageInfo.gcmCmdResetOffset		= CELL_GCM_INIT_STATE_OFFSET;
		m_PageInfo.gcmInjectBufOff			= CELL_GCM_INIT_STATE_OFFSET;
		m_PageInfo.gcmRsxBaseAddress		= 0xC0000000;
	}
}

/*
	looks up the handle of a job by its name
		cpJobName: name of the job
		cStrLen:   length of the name
	returns the address of the matching entry of m_JobStringTable
	if no job matches the address of a static "failed lookup" entry carrying INVALID_JOB_HANDLE is returned
	with USE_JOB_QUEUE_VERIFICATION a failed lookup is printed together with all available jobs (for names longer than 1)
		and the failed entry keeps a copy of the requested name (at most 127 characters)
	the failed entry and its name buffer are static and shared by all callers
*/
const TJobHandle CJobManSPU::GetJobHandle(const char* cpJobName, const uint32 cStrLen) const
{
#if defined(USE_JOB_QUEUE_VERIFICATION)
	//create a buffer for the last failed job invocation
	static char sFailedBuf[128];
#endif
	static SJobStringHandle cFailedLookup = {"", 0, INVALID_JOB_HANDLE};
	const SJobStringHandle cLookup = {cpJobName, cStrLen, INVALID_JOB_HANDLE};
	const std::vector<SJobStringHandle>::const_iterator cEnd = m_JobStringTable.end();
	const std::vector<SJobStringHandle>::const_iterator cRes = std::find(m_JobStringTable.begin(), cEnd, cLookup);
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(cRes == cEnd)
	{
		if(cStrLen > 1)
		{
			PrintOut("Failed to obtain job handle for job: \"%s\"\n", cpJobName);
			for(std::vector<SJobStringHandle>::const_iterator it = m_JobStringTable.begin(); it != cEnd; ++it)
				PrintOut("  available jobs: \"%s\"\n", it->cpString);
			PrintOut("\n");
		}
		//strncpy does not terminate if the name fills the count, terminate explicitly
		//otherwise the rest of a previously stored longer name would stay visible
		const uint32 cCopyLen = std::min(cStrLen, (uint32)(sizeof(sFailedBuf) - 1));
		strncpy(sFailedBuf, cpJobName, cCopyLen);
		sFailedBuf[cCopyLen] = '\0';
		cFailedLookup.cpString = sFailedBuf;
		cFailedLookup.strLen = cStrLen;//NOTE(review): not limited to the 127 characters actually stored
		return (TJobHandle)&cFailedLookup;
	}
	else
		return (TJobHandle)&(cRes->cpString);
#else
	return (cRes == cEnd)?(TJobHandle)&cFailedLookup : (TJobHandle)&(cRes->cpString);
#endif
}

//initializes the job manager: stores the allocator callbacks, sets up the bucket memory, the job string table,
//	the SPU page directory and the job queue, creates and starts the raw SPUs and optionally initializes SPURS
//FreeFunc / MallocFunc:			allocator callbacks stored in m_FreeFunc / m_MallocFunc
//cSPUThreadCnt / cSPURSCnt:	number of SPUs requested for SPU threads / SPURS
//bEnablePrintf:							stored in m_bEnablePrintf
//returns true on success and false if an initialization step failed
//	(a failing SPURS initialization terminates the process via exit(-1))
const bool CJobManSPU::InitSPUs(TSPUFreeFunc FreeFunc, TSPUMallocFunc MallocFunc, const int cSPUThreadCnt, const int cSPURSCnt, bool bEnablePrintf)
{
	m_bEnablePrintf = bEnablePrintf;
	m_FreeFunc		= FreeFunc;
	m_MallocFunc	= MallocFunc;
#if defined(SUPP_SN) 
	if(snIsDebuggerRunning())
	{
		if(snInit() != SUCCEEDED)
		{
			PrintOut("Failed to initialize libsn\n");
			return false;
		}
	}
#endif

	if(m_NumSPUAllowed == 0)
		return false;//something has gone wrong during initialization

	assert(!m_Initialized);

	//the 6th SPU can be used for SPUThreads and SPURS, if more are required, set m_NumSPUAllowed accordingly
#if defined(SUPPORT_SPURS)
	const int cOtherUsage = std::max(1, cSPUThreadCnt + cSPURSCnt);
	m_NumSPUAllowed = NPPU::scMaxSPU - (cOtherUsage-1);
#endif

	InitBucketMemory();

	m_pSPURep = (NPageBin::SHeader*)SPURepository;
	assert(m_pSPURep && ((uint32)m_pSPURep & 127) == 0);

#ifdef DO_SPU_FUNCPROFILING
	m_FuncProfVecClearCnt				= m_pSPURep->funcProfAreaSize / 128;
	m_pFuncProfilingArea				= (uint32*)m_MallocFunc(m_pSPURep->funcProfAreaSize, 128);
	m_pFuncProfilingAreaLastFrame = (uint32*)m_MallocFunc(sizeof(uint32) * m_pSPURep->funcProfCount, 4);
	memset(m_pFuncProfilingAreaLastFrame,0,sizeof(uint32) * m_pSPURep->funcProfCount);
	m_FuncProfStats.reserve(m_pSPURep->funcProfCount);
	UnlockFuncProfilingArea();
	m_pFuncProfStringTable			= (char*)m_pSPURep + m_pSPURep->funcProfStringOff;
	m_pFuncStringIndices				= (uint16*)((uint8*)m_pSPURep + m_pSPURep->funcProfStringIndOff);
	m_PageInfo.funcProfTimingEA	= (uint32)m_pFuncProfilingArea;
	memset(m_pFuncProfilingArea, 0, m_pSPURep->funcProfAreaSize);
	//ResetFuncProfilingArea();
#endif

	//build job string table
	CreatePageJobStringTable();	
	//build SPU page directory (page main mem address)
	CreateSPUPageDir();

	sys_spu_initialize(6, m_NumSPUAllowed);//init all SPUs

	//push and pull position both start at the first job info block, top address points right behind the last one
	m_SPUJobQueue.push.lockObtained				= 0;
	m_SPUJobQueue.push.baseAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[0];
	m_SPUJobQueue.push.topAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[scMaxWorkQueueJobs];//pointing right behind it
	m_SPUJobQueue.push.curAddr						= m_SPUJobQueue.push.baseAddr;

	m_SPUJobQueue.pull.lockObtained				= 0;
	m_SPUJobQueue.pull.baseAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[0];
	m_SPUJobQueue.pull.topAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[scMaxWorkQueueJobs];
	m_SPUJobQueue.pull.curAddr						= m_SPUJobQueue.pull.baseAddr;
	memset(m_SPUJobQueue.spuPacketSync, 0, SQueueNodeSPU::scSyncCount);
	m_CurSpuPacketSyncIndex = 127;//to be counted down to 1 then starting at 127 again..

	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		//if the system cannot provide another raw SPU, continue with the ones created so far
		if(EAGAIN == sys_raw_spu_create((sys_raw_spu_t*)&m_SpuIDs[i], NULL))
		{
			m_NumSPUAllowed = i;
			PrintOut("Available number of SPUs has been reseted to: %d\n",i);
			break;
		}

#if defined(SUPP_SN)
		snRawSPUNotifyCreation(m_SpuIDs[i]);
#endif
		//reset all pending interrupts before starting
//		sys_raw_spu_set_int_stat(m_SpuIDs[i], 2, 0xFUL);
//		sys_raw_spu_set_int_stat(m_SpuIDs[i], 0, 0xFUL);
//		sys_raw_spu_set_spu_cfg(m_SpuIDs[i], 0);//no OR mode

		CreateRawSpuIntrHandler(m_SpuIDs[i], i);

		//run the SPU program, this will look at the job queue and start trying to get work to do
		LoadSPULoaderDriver(m_SpuIDs[i], i);
	}
	TestSPUs();//test if the SPUs have been started

	//init other usage SPUs
	int spursInitialized = 0;
	if(cSPURSCnt > 0)
	{
		int ret = cellSysmoduleLoadModule(CELL_SYSMODULE_SPURS);
		if(ret != CELL_OK) 
		{
			PrintOut("Failed to load SPURS module: 0x%x\n", ret);
			return false;
		}
		sys_ppu_thread_t currentThreadID;
		int currentThreadPriority;
		ret = sys_ppu_thread_get_id(&currentThreadID);
		if(ret != CELL_OK) 
		{
			PrintOut("Cannot get current thread ID (%d)\n", ret);
			return false;
		}
		ret = sys_ppu_thread_get_priority(currentThreadID, &currentThreadPriority);
		if(ret!=CELL_OK) 
		{
			PrintOut("Cannot get current thread priority (%d)\n", ret);
			return false;
		}
		//priority for the secondary PPU threads: one below the current value, clamped at 0
		const uint32 cSecPPUThreadPrio = (currentThreadPriority>0)?(currentThreadPriority - 1):0;
		//use the clamped priority here as well, currentThreadPriority-1 would become -1 for a priority of 0
		ret = spu_printf_initialize((int)cSecPPUThreadPrio, NULL);
		if(ret != CELL_OK)
			PrintOut("Failed to initialize spu_printf (%d)\n", ret);//not fatal, continue without SPU printf
		CellSpursAttribute	spursAttrib;
		ret = cellSpursAttributeInitialize(&spursAttrib, 1, 250, cSecPPUThreadPrio, false);
		if(ret == CELL_OK)
		{
			//printf support is optional, only report a failure here
			const int cPrintfRet = cellSpursAttributeEnableSpuPrintfIfAvailable(&spursAttrib);
			if(cPrintfRet != CELL_OK)
				PrintOut("Failed to enable SPURS spu_printf (%d)\n", cPrintfRet);
			ret = cellSpursInitializeWithAttribute(&m_SPURS, &spursAttrib);
		}
//		ret = cellSpursInitialize(&m_SPURS, cSPURSCnt, 250, cSecPPUThreadPrio, false);//OBSOLETE
		if(ret != CELL_OK) 
		{
			PrintOut("Cannot initialize SPURS (%d)\n", ret);
			exit(-1);
		}
		spursInitialized = cSPURSCnt;
	}
	m_SpursInitialized = spursInitialized;

#if defined(_DEBUG)
	PrintOut("JobManager(debug): init %d raw SPUs, %d SPURS (SPU Driver: %.1f KB)\n", m_NumSPUAllowed, spursInitialized, (float)m_DriverSize / 1024.f);
#elif defined(DO_SPU_PROFILING)
	PrintOut("JobManager(profile): init %d raw SPUs, %d SPURS (SPU Driver: %.1f KB)\n", m_NumSPUAllowed, spursInitialized, (float)m_DriverSize / 1024.f);
#else
	PrintOut("JobManager(release): init %d raw SPUs, %d SPURS (SPU Driver: %.1f KB)\n", m_NumSPUAllowed, spursInitialized, (float)m_DriverSize / 1024.f);
#endif

	m_Initialized = true;
	return true;
}

//gets the job slot for the next job (to get the storage index for SJobData)
//slots whose job is still running are skipped (and marked as not fetchable) since their data would get overwritten
//rJobSlot:		receives the index of the free slot inside m_SPUJobQueue.jobInfoBlocks
//rNextPush:	receives the push address following the returned slot (wrapped to the base address at the end)
//returns eAJR_Success, or eAJR_EnqueueTimeOut if USE_JOB_QUEUE_VERIFICATION is defined and the
//	next push address stayed equal to the pull address for scMaxWaitLoopCount sleeps
const NPPU::EAddJobRes NPPU::CJobManSPU::GetSPUJobSlot(uint32& __restrict rJobSlot, uint32& __restrict rNextPush)
{
	//do changes in sync with SPU implementation in JobAPI_spu.cpp
	//wait until a job slot becomes available
	uint32 curPush;
	//get current push address according to if it is a depending job or not, replace following code by mask
	//	curPush = (m_RealCurPushAddress == ~0)?m_SPUJobQueue.push.curAddr : m_RealCurPushAddress;//get current push address
	//mask is 0 if m_RealCurPushAddress == ~0, which selects m_SPUJobQueue.push.curAddr below
	//NOTE(review): for any other value the mask relies on the sign bit of -(address+1) - confirm addresses stay below 0x7FFFFFFF
	const uint32 cCurPushMask = (uint32)(((int32)(-(m_RealCurPushAddress - ~0))) >> 31);
	curPush = m_RealCurPushAddress & cCurPushMask | m_SPUJobQueue.push.curAddr & ~cCurPushMask;

#if defined(USE_JOB_QUEUE_VERIFICATION)//assume we never hit it, enlarge it
	static const uint32 scMaxWaitLoopCount = 10;//after 10 loops of sleeping, it returns false
#endif

	while(1)//continue until we found an empty job slot
	{
		rNextPush = curPush + NSPU::NDriver::scSizeOfSJobQueueEntry;
		//start at index 0 if we reached the end, replace following code by mask
		//	if(rNextPush == m_SPUJobQueue.push.topAddr)	rNextPush = m_SPUJobQueue.push.baseAddr;
		const NSPU::NDriver::SJobQueuePos& __restrict crPush = m_SPUJobQueue.push;
		//mask is all ones while rNextPush is below the top address (negative difference), 0 once it reached it
		const uint32 cNextPushMask = (uint32)(((int32)(rNextPush - crPush.topAddr)) >> 31);
		rNextPush = rNextPush & cNextPushMask | crPush.baseAddr & ~cNextPushMask;

#if defined(USE_JOB_QUEUE_VERIFICATION)//assume we never hit it, enlarge it
		//the next push address must not catch up with the pull address, sleep a bounded number of times
		uint32 i = 0;//counter for idle loops
		while(rNextPush == m_SPUJobQueue.pull.curAddr)
		{
			sys_timer_usleep(1);//wait 1 microsecond
			++i;
			if(i == scMaxWaitLoopCount)
				return eAJR_EnqueueTimeOut;
		}
#endif
		//now check if job in push job slot has been finished, if not, increment push address and mark as not to be pulled next time
		//slot index = byte offset from the queue base divided by the entry size (expressed as a shift)
		rJobSlot = (uint32)((curPush - crPush.baseAddr) >> NSPU::NDriver::scSizeOfSJobQueueEntryShift);
		assert(rJobSlot < scMaxWorkQueueJobs);
		NSPU::NDriver::SInfoBlock& __restrict rJobInfoBlock	= m_SPUJobQueue.jobInfoBlocks[rJobSlot];
		//hint to the compiler: the slot is expected to be free
		if(__builtin_expect(!rJobInfoBlock.jobState.IsRunning(), true))
			return eAJR_Success;
#if defined(USE_JOB_QUEUE_VERIFICATION)
		NPPU::SJobData& __restrict rJobdata									= m_SPUJobQueue.jobData[rJobSlot];
		//job is still in progress, check timeout and get next one
	#if defined(SUPP_SN)
		if(!m_FrameDebuggingActive && JobTimeOutElapsed(rJobdata.jobStartTime))
	#else
		if(JobTimeOutElapsed(rJobdata.jobStartTime))
	#endif
		{
			//only a warning is issued, the slot is skipped by the block below in any case
//		return eAJR_EnqueueTimeOutPushJob;
			const NSPU::NDriver::SInfoBlock& __restrict crExistInfoBlock	= m_SPUJobQueue.jobInfoBlocks[rJobSlot];
			const char *cpJobName = CJobManSPU::Instance()->GetJobName(crExistInfoBlock.jobId);
			PrintOut("Warning in JobManSPU: spu job: \"%s\" exceeded timout for occupying job slot\n",cpJobName);
		}
//		else
#endif
		{
			//mark entry as invalid, increment current push and next push address
			//this job remains in the queue
			rJobInfoBlock.SetFetchable(false);//not to be fetched this round (still running)
			curPush += NSPU::NDriver::scSizeOfSJobQueueEntry;
			//start at index 0 if we reached the end, replace following code by mask
			//	if(curPush == crPush.topAddr)	curPush = crPush.baseAddr;
			const uint32 cCurNextPushMask = (uint32)(((int32)(curPush - crPush.topAddr)) >> 31);
			curPush = curPush & cCurNextPushMask | crPush.baseAddr & ~cCurNextPushMask;
		}
	}
	//not reached, the loop above is only left via return
	return eAJR_Success;
}

//returns m_DebuggingActive if SN support (SUPP_SN) is compiled in, false otherwise
const bool CJobManSPU::IsDebuggingActive() const
{
#if defined(SUPP_SN)
	return m_DebuggingActive;
#else
	return false;
#endif
}

//enqueues a job and all SPU packets attached to it into the SPU job queue
//crJob:						job delegator providing parameter data, packets, callback, queue and external job state
//cOpMode:					operation mode stored into the info block via SetOpMode
//cMinStackSizeKB:	stored as minStackSize of the info block
//cJobHandle:				handle identifying the job binary and job id
//returns eAJR_Success or an error code, errors other than eAJR_InvalidJobHandle are only
//	reported if USE_JOB_QUEUE_VERIFICATION is defined
//the job queue is locked between Lock() and UnLock(), every error path in between releases the lock again
const EAddJobRes CJobManSPU::AddJob
(
	CSPUJobDel& __restrict crJob,
	const uint32 cOpMode,
	const uint8 cMinStackSizeKB,
	const TJobHandle cJobHandle
)
{
	const uint32 cExtJobAddr = (uint32)crJob.m_pJobState;
	__dcbt((const void*)cExtJobAddr);//touch the external job state early, it is written further below
//	assert(cJobHandle && cJobHandle->jobHandle != INVALID_JOB_HANDLE);
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(!cJobHandle || cJobHandle->jobHandle == INVALID_JOB_HANDLE)
		return eAJR_InvalidJobHandle;
#else
	if(cJobHandle->jobHandle == INVALID_JOB_HANDLE)
		return eAJR_InvalidJobHandle;
#endif

	NPageBin::SJob *pJob = (NPageBin::SJob*)((void*)cJobHandle->jobHandle);

	void* __restrict pJobProgramData	= (void*)pJob;
	const uint32 cJobProgramSize			= ((uint32)pJob->totalJobSize << 2);//stored in multiple of 4 bytes
	assert(pJob->initialPages[0] >= 0);
	const uint16 cFirstPageIndex			= (uint16)pJob->initialPages[0];

	const uint32 cJobId								= cJobHandle->jobId;

#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(!m_Initialized)
		return eAJR_SPUNotInitialized;
#endif

	assert(pJobProgramData);

	const void *cpQueue		= crJob.GetQueue();
	const bool cNoQueue		= (cpQueue == NULL);
	const bool cKeepCache	= crJob.KeepCache();

	//setup for all spu packets
	const uint32 cOrigParamSize = crJob.GetParamDataSize();
	const uint8 cParamSize	= cOrigParamSize >> 4;//stored in multiple of 16 bytes
	const uint16 cJobSize		= cJobProgramSize >> 2;
	assert((cJobProgramSize & 3) == 0);
	const SCallback& __restrict crCallback = crJob.GetCallbackdata();

	//the info block is assembled locally first and copied into the queue slot once one has been obtained
	NSPU::NDriver::SInfoBlock infoBlock;
	//reset info block
	infoBlock.depJobIndex						= NSPU::NDriver::SInfoBlock::scNoIndex;
	infoBlock.spuPacketSyncIndex		= NSPU::NDriver::SInfoBlock::scNoIndex;
	infoBlock.callbackIndex					= NSPU::NDriver::SInfoBlock::scNoIndex;
	infoBlock.minStackSize					= cMinStackSizeKB;
	//branch free: set scHasQueue only if a queue address is present
	const int cQueueAddr						= (int)cpQueue;
	const unsigned int cQueueMask		= (unsigned int)(((int)(-cQueueAddr)) >> 31);
	unsigned int flagSet						= ((unsigned int)NSPU::NDriver::SInfoBlock::scHasQueue & cQueueMask);
	flagSet												 |= NSPU::NDriver::SInfoBlock::scFetchable;
	infoBlock.eaExtJobStateAddress	= (unsigned int)cpQueue;
	infoBlock.flags									= (unsigned char)flagSet;
	//branch free: set scKeepQueueCache only if cKeepCache is true
	const unsigned int cKeepCacheMask	= (unsigned int)(((int)(-(int)cKeepCache)) >> 31);
	infoBlock.flags2									= NSPU::NDriver::SInfoBlock::scKeepQueueCache & cKeepCacheMask;
	infoBlock.eaDMAJobAddress				= (uint32)pJobProgramData;
	infoBlock.jobSize								= cJobSize;
	infoBlock.paramSize							= cParamSize;
	infoBlock.SetOpMode(cOpMode);
	infoBlock.SetPageMode((NPPU::EPageMode)pJob->GetPageMode());
	infoBlock.SetPageMaxSize(pJob->GetMaxPageSize());
	infoBlock.curThreadId						= crJob.GetCurrentThreadId();
	infoBlock.bssSize								= pJob->bssSize;
	infoBlock.bssOff									= pJob->bssOffset;
	infoBlock.funcTableBinOff				= pJob->funcTableBinOff;
	infoBlock.funcTableSize					= pJob->funcTableSize;
#ifdef DO_SPU_FUNCPROFILING
	infoBlock.funcProfTimingCount		= pJob->funcProfCount;
#endif
	infoBlock.firstPageIndex				= cFirstPageIndex;
	infoBlock.jobId									= cJobId;

#if defined(SUPP_SPU_FRAME_STATS)
	infoBlock.frameProfIndex = (unsigned char)(cJobId * JOB_FRAME_STATS_BUFS + m_CurFrameBufIndex);
#endif

#if defined(SUPP_SN)
	//debugging is enabled for exactly one invocation of the job registered via EnableSPUJobDebugging
	if((uint32)pJobProgramData == (uint32)m_SPUJobDebugHandle)
	{
		m_SPUJobDebugHandle = (void*)0;//reset
		infoBlock.EnableDebug();
		infoBlock.EnableDriverDebug(m_SPUDriverDebuggingEnabled);
	}
#endif

	Lock();
	uint32 jobSlot, nextPush;
	const EAddJobRes cEnqRes = GetSPUJobSlot(jobSlot, nextPush);
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(cEnqRes != eAJR_Success)
	{
		UnLock();//do not leave the job queue locked on failure
		return cEnqRes;
	}
#endif

	NSPU::NDriver::SInfoBlock& __restrict rInfoBlock	= m_SPUJobQueue.jobInfoBlocks[jobSlot];
	infoBlock.AssignMembersTo(&rInfoBlock);

	const CSPUPacketBase** __restrict ppSPUPackets;
	uint32 spuPacketCount = 0;
	uint32 extJobAddr = 0;
	//if a producer/consumer queue is used, do neither set parameter addresses nor add packets nor set external job state
	if(cNoQueue)
	{
		crJob.GetAllPackets(spuPacketCount, ppSPUPackets);//always valid pointer (static array)
		if(__builtin_expect(spuPacketCount > 0, false))
		{
			rInfoBlock.spuPacketSyncIndex			= m_CurSpuPacketSyncIndex--;
			assert(m_SPUJobQueue.spuPacketSync[rInfoBlock.spuPacketSyncIndex] == 0);//should be already counted down (semaphore like)
			m_SPUJobQueue.spuPacketSync[rInfoBlock.spuPacketSyncIndex] = spuPacketCount;//to be counted down
			//since we have index 0 reserved for locking, we need to decrement for branch free behavior
			//replace following code by mask:
			//	if(m_CurSpuPacketSyncIndex == 0)
			//		m_CurSpuPacketSyncIndex = NPPU::SQueueNodeSPU::scSyncMaxIndex;
			const uint32 cSyncMask = (uint32)(((int32)(-m_CurSpuPacketSyncIndex)) >> 31);
			m_CurSpuPacketSyncIndex = NPPU::SQueueNodeSPU::scSyncMaxIndex & ~cSyncMask | m_CurSpuPacketSyncIndex & cSyncMask;
		}

		//get DMA transfer entries from job base
		CreateDMAListSingle(cOrigParamSize, rInfoBlock.GetParamAddress(), crJob);

		if(cExtJobAddr != 0)
		{
			extJobAddr = cExtJobAddr;
			rInfoBlock.SetExtJobStateAddress(cExtJobAddr);
			crJob.m_pJobState->running = 1;//set running
		}
	}//cHasQueue

#if defined(USE_JOB_QUEUE_VERIFICATION)
	//NOTE(review): at this point the external job state may already have been set to running above
	if(cJobProgramSize >= GetAvailableSPUSize())
	{
		UnLock();//do not leave the job queue locked on failure
		return eAJR_JobTooLarge;
	}
#endif
	//register callback, use if to avoid cache miss otherwise
	void * __restrict pJobData = NULL;
	if(__builtin_expect(crCallback.pCallbackFnct != 0, false))
	{
		NPPU::SJobData& __restrict rJobdata = m_SPUJobQueue.jobData[jobSlot];
		rJobdata.callbackData			= crCallback;
		rInfoBlock.callbackIndex	= jobSlot;
		pJobData = &rJobdata;
	}	
	assert(jobSlot < 256);

#if defined(DO_SPU_PROFILING)
	rInfoBlock.eaJobPerfAddress = (uint32)(void*)crJob.m_pJobPerfData;
	rInfoBlock.SetTransferProfDataBack(m_pProfStatControl && (*m_pProfStatControl == 1));
#endif
	m_SPUJobQueue.push.curAddr = nextPush;//entry usage is safe til we increment push address so that pull address can fetch last set entry
	m_RealCurPushAddress = ~0;//reset
	const uint32 *const pInfoBlockRemaining	= (uint32*)&rInfoBlock.jobId;
	//now set up the spu packet jobs, first one has been made available already
	//dependent jobs can only be attached to the main job
	for(uint32 i=0; i<spuPacketCount; ++i)
	{
		const CSPUPacketBase& __restrict crSPUPacket = (const CSPUPacketBase&)*ppSPUPackets[i];

		const EAddJobRes cPacketEnqRes = GetSPUJobSlot(jobSlot, nextPush);
	#if defined(USE_JOB_QUEUE_VERIFICATION)
		if(cPacketEnqRes != eAJR_Success)
		{
			UnLock();//do not leave the job queue locked on failure
			return cPacketEnqRes;
		}
	#endif
		NSPU::NDriver::SInfoBlock& __restrict rAddInfoBlock	= m_SPUJobQueue.jobInfoBlocks[jobSlot];
		rAddInfoBlock.eaDMAJobAddress		= (uint32)pJobProgramData;
		//copies the 4 bytes starting at minStackSize from the main job in one go
		*(uint32*)&rAddInfoBlock.minStackSize = *(uint32*)&rInfoBlock.minStackSize;
//		rAddInfoBlock.flags2						= rInfoBlock.flags2;
		rAddInfoBlock.SetExtJobStateAddress(extJobAddr);
#if defined(DO_SPU_PROFILING)
		rAddInfoBlock.eaJobPerfAddress = 0;//only available for main job
		rAddInfoBlock.SetTransferProfDataBack(false);//will only be set for 1 SPU per job
#endif
		//perform fast copy op: 7 words starting at jobId are taken over from the main job
		uint32 *pAddInfoBlockRemaining	= (uint32*)&rAddInfoBlock.jobId;
		const uint32 cVal[7] = 
			{pInfoBlockRemaining[0],	pInfoBlockRemaining[1], pInfoBlockRemaining[2], 
			pInfoBlockRemaining[3], pInfoBlockRemaining[4], pInfoBlockRemaining[5], pInfoBlockRemaining[6]};

		pAddInfoBlockRemaining[0]				= cVal[0];
		pAddInfoBlockRemaining[1]				= cVal[1];
		pAddInfoBlockRemaining[2]				= cVal[2];
		pAddInfoBlockRemaining[3]				= cVal[3];
		pAddInfoBlockRemaining[4]				= cVal[4];
		pAddInfoBlockRemaining[5]				= cVal[5];
		pAddInfoBlockRemaining[6]				= cVal[6];

		rAddInfoBlock.depJobIndex				= NSPU::NDriver::SInfoBlock::scNoIndex;
		//apply callback, since only one raises it, copy from main job
		if(pJobData)
		{
			NPPU::SJobData& __restrict rJobdata = *(NPPU::SJobData*)pJobData;
			NPPU::SJobData& __restrict rAddJobdata = m_SPUJobQueue.jobData[jobSlot];
			rAddJobdata.callbackData						= rJobdata.callbackData;
		}
		const CSPUJobDel& __restrict crJobDel = crSPUPacket.m_JobDelegator;
		CreateDMAListSingle(cOrigParamSize, rAddInfoBlock.GetParamAddress(), crJobDel);

	#if defined(USE_JOB_QUEUE_VERIFICATION)
		if(cJobProgramSize >= GetAvailableSPUSize())
		{
			UnLock();//do not leave the job queue locked on failure
			return eAJR_JobTooLarge;
		}
	#endif

		//mark as fetchable
		rAddInfoBlock.SetFetchable(true);
		m_SPUJobQueue.push.curAddr	= nextPush;
		rAddInfoBlock.jobState.running = 1;
	}//SPU packet loop

	UnLock();

#if defined(USE_JOB_QUEUE_VERIFICATION)
	{
		//remember the start time of the slot used last, it is checked against the timeout in GetSPUJobSlot
		NPPU::SJobData& __restrict rJobdata = m_SPUJobQueue.jobData[jobSlot];
		GetTimeTB(rJobdata.jobStartTime);
	}
#endif

	return eAJR_Success;
}

//checks the state of all raw SPUs and reports every SPU which executed a STOPD instruction,
//	has its run control cleared or is neither running, waiting for a channel nor single stepping
//cIgnoreDebugState:	if false, the check is skipped (returning 0) while debugging is active
//returns 0 if all SPUs look fine and -1 if a failure has been detected on at least one SPU
//the current job and program counter are only printed for the SPUs which actually failed
int CJobManSPU::VerifySPUs(const bool cIgnoreDebugState) const
{
	if(!cIgnoreDebugState && IsDebuggingActive())
		return 0;
	int detectedFailure = 0;
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		bool spuFailed = false;//failure state of this SPU only
		const uint32 cStatus = sys_raw_spu_mmio_read(m_SpuIDs[i], scPCSPUStatus);
		if((cStatus & 0xFFFF0000) == 0x3FFF0000/*executed STOPD instr*/ )
		{
			PrintOut("SPU id=%d has executed STOPD instruction\n",i);
			spuFailed = true;
		}
		if(!(sys_raw_spu_mmio_read(m_SpuIDs[i], SPU_RunCntl) & 0x1))
		{
			PrintOut("SPU id=%d has run control 0\n",i);
			spuFailed = true;
		}
		const int scStatusMask = SPU_STATUS_RUNNING | SPU_STATUS_WAITING_FOR_CHANNEL | SPU_STATUS_SINGLE_STEP;
		if(!(cStatus & scStatusMask))
		{
#if defined(SUPP_SN)
			//NOTE(review): the SPU index is passed here whereas creation/destruction are notified with m_SpuIDs[i] - confirm
			NPPU::WriteSPUProbReg(i, NPPU::scPCSPURunCntl, 0);
			snRawSPUNotifySPUStopped(i);
#endif
			PrintOut("SPU id=%d has been halted\n",i);
			spuFailed = true;
		}
		if(spuFailed)
		{
			PrintOut("Job=%s  PC=0x%08x\n",RetrieveCurrentSPUJob(i),ReadSPUProbReg(i, scPCSPUNPC)-4);
			detectedFailure = -1;
		}
	}
	return detectedFailure;
}

//verifies all SPUs and, if USE_JOB_QUEUE_VERIFICATION is defined and a failure has been detected,
//	clears the job queue and destroys, recreates and reloads every raw SPU
void CJobManSPU::TestSPUs()
{
	const int cVerifyResult = VerifySPUs();
	
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(cVerifyResult == 0)
		return;//all SPUs are fine

	//restart all SPUs, clear command queue
	PrintOut("JobManager: resetting job queue and restarting all SPUs\n");
	Lock();
#if defined(SUPP_SN)
	SetDebuggingActive(false);
#endif
	//wipe all info blocks
	memset(m_SPUJobQueue.jobInfoBlocks, 0, sizeof(m_SPUJobQueue.jobInfoBlocks));
	//rewind push and pull position to the start of the queue
	m_SPUJobQueue.push.lockObtained				= 0;
	m_SPUJobQueue.push.curAddr						= m_SPUJobQueue.push.baseAddr;
	m_SPUJobQueue.pull.lockObtained				= 0;
	m_SPUJobQueue.pull.curAddr						= m_SPUJobQueue.pull.baseAddr;
	memset(m_SPUJobQueue.spuPacketSync, 0, SQueueNodeSPU::scSyncCount);
	m_CurSpuPacketSyncIndex = 127;//to be counted down to 1 then starting at 127 again..
	for(uint32 spu=0; spu<m_NumSPUAllowed; ++spu)
	{
		const bool cDestroyed = (SUCCEEDED == sys_raw_spu_destroy(m_SpuIDs[spu]));
		if(!cDestroyed)
		{
			if(m_pLog)
				m_pLog->LogError("TestSPUs: failed to destroy SPU id: %d\n",m_SpuIDs[spu]);
			PrintOut("TestSPUs: failed to destroy SPU id: %d\n",m_SpuIDs[spu]);
		}
		const bool cRecreated = (SUCCEEDED == sys_raw_spu_create((sys_raw_spu_t*)&m_SpuIDs[spu], NULL));
		if(!cRecreated)
		{
			if(m_pLog)
				m_pLog->LogError("TestSPUs: failed to recreate SPU id: %d\n",m_SpuIDs[spu]);
			PrintOut("TestSPUs: failed to recreate SPU id: %d\n",m_SpuIDs[spu]);
		}
		//the loader driver is started in any case, as before
		LoadSPULoaderDriver(m_SpuIDs[spu], spu);
	}
	UnLock();
#endif
}

const bool CJobManSPU::SPUJobsActive() const
{
	//go through all jobs and check their state
	NSPU::NDriver::SInfoBlock * const pJobInfoBlocks = (NSPU::NDriver::SInfoBlock*)m_SPUJobQueue.jobInfoBlocks;
	for(unsigned int i=0; i<scMaxWorkQueueJobs; ++i)
	{
		if(pJobInfoBlocks[i].jobState.IsRunning())
			return true;
	}
	return false;
}

//allocates and initializes the bucket memory for each of the m_NumSPUAllowed SPUs
//per SPU one chunk holds all bucket headers (header, linked list, block directory), each header 128 byte aligned,
//	followed by the list used for tracking memory released by the SPU
//the memory blocks referenced by the directories are allocated individually via m_MallocFunc
void CJobManSPU::InitBucketMemory()
{
	using NSPU::SBucketHeader;
	using NSPU::SBucket;
	using NSPU::SBucketInfo;
	using NSPU::SBucketDir;

	//number of memory blocks per bucket size
	//to make header and list be part of the same cacheline, make sure the aggregated count is below 128
	//memory is allocated in one block to apply the async write back functionality
	static const uint8 scBucketNumTable[NSPU::SBucketInfo::scBucketCount] = 
	{
		64 - SBucketHeader::scBucketHeaderSize,//32 byte buckets
		64 - SBucketHeader::scBucketHeaderSize,//64 byte buckets
		64 - SBucketHeader::scBucketHeaderSize,//128 byte buckets
		64	- SBucketHeader::scBucketHeaderSize,//256 byte buckets
		32	- SBucketHeader::scBucketHeaderSize,//512 byte buckets
		32	- SBucketHeader::scBucketHeaderSize,//1024 byte buckets
		8	- SBucketHeader::scBucketHeaderSize,//2048 byte buckets
		8	- SBucketHeader::scBucketHeaderSize	//4096 byte buckets
	};

	//first count the totally required memory per SPU and allocate in one chunk
	//this mirrors the placement done in the allocation loop below: header + list + directory, then align to 128
	uint32 totalSize = 0;
	uint32 curTotalSize = totalSize;
	for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
	{	
		const uint32 cBucketHeaderSize = SBucketHeader::scBucketHeaderSize + scBucketNumTable[j] + scBucketNumTable[j] * sizeof(uint32);
		totalSize += cBucketHeaderSize;
		curTotalSize = totalSize;//remember the size before alignment
		totalSize = (totalSize + 127) & ~127;//must be 128 byte aligned
	}
	totalSize = curTotalSize;//revert last alignment change
	totalSize += sizeof(uint32)*SBucketInfo::scFreedMaxCount;//memory used for memory release tracking
	totalSize = (totalSize+127)&~127;

	//for each SPU initialize and allocate all buckets
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		uint8 *pMem = (uint8*)m_MallocFunc(totalSize, 128);
		m_pBucketDirMem[i] = pMem;//kept to be able to release the chunk later
		m_AllocatedMemory += totalSize;
		SBucketInfo& rBucketInfo = ((SPPUMemRequestData*)m_MemAreaMan.GetSPUMemArea(i))->bucketInfo;
		NSPU::SBucket *pBuckets = &rBucketInfo.bucketHeaders[0];
		uint32 bucketSize = NSPU::SBucketInfo::scBucketSizeMin;

		rBucketInfo.freedCount = 0;
		uint8 *pCurMem = pMem;//current write position inside the chunk

		for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
		{	
			//allocate entire bucket header at once
			pMem = (uint8*)(((uint32)pCurMem + 127) & ~127);//each header must be aligned
			const uint8 cNumBlocks = scBucketNumTable[j];	
			const uint32 cBucketHeaderSize = 
				SBucketHeader::scBucketHeaderSize + //header
				cNumBlocks +						//single linked block list
				cNumBlocks * sizeof(uint32);//block directory
			SBucketHeader* pBuckHeader = (SBucketHeader*)pMem;

			//set up bucket size and pointer
			pBuckets[j].pBucketHeader = pBuckHeader;
			pBuckets[j].available			= cNumBlocks;
			pBuckets[j].size					= bucketSize;
			pBuckets[j].numTotal			= cNumBlocks;
			//set up bucket header
			pBuckHeader->listIndex		= 0;//make index 0 the first available bucket
			pBuckHeader->listIndexEnd = cNumBlocks-1;//last available index, required to append new available ones after reload
			pBuckHeader->dirIndex			= BUCKET_NULL;//no applied slots initially
			pBuckHeader->dirIndexEnd	= BUCKET_NULL;//no applied slots initially
			//the linked list (one byte per block) starts right behind the header
			uint8 *pLinkedList = (uint8*)pBuckHeader + SBucketHeader::scBucketHeaderSize;
			//set up linked list, always pointing to next one
			//NOTE(review): assumes cNumBlocks > 0, otherwise cNumBlocks-1 becomes negative - confirm scBucketHeaderSize
			for(uint32 b=0; b<cNumBlocks-1; ++b)
				pLinkedList[b] = (uint8)(b+1);
			pLinkedList[cNumBlocks-1] = BUCKET_NULL;//terminate the list
			assert(sizeof(SBucketDir) == sizeof(uint32));
			//the block directory starts right behind the linked list
			SBucketDir *pDir = (SBucketDir*)((uint8*)pBuckHeader + SBucketHeader::scBucketHeaderSize + cNumBlocks);
			//set up directory by allocating the individual buckets
			for(uint32 b=0; b<cNumBlocks; ++b)
			{
				pDir[b].address	= (uint32)(m_MallocFunc(bucketSize, std::min((uint32)128, bucketSize)));//align to bucketSize (128 max)
				m_AllocatedMemory += bucketSize;
			}
			pCurMem = (uint8*)(&pDir[cNumBlocks]);//free info is right behind last bucket directory
			bucketSize <<= 1;//always power of 2
		}
		//the list of freed memory is located behind the directory of the last bucket
		rBucketInfo.pFreedList = (uint32*)pCurMem;
		memset(rBucketInfo.pFreedList, 0, sizeof(uint32)*SBucketInfo::scFreedMaxCount);
	}
}

//releases the memory recorded in the freed list of one SPU and refills its buckets with newly allocated blocks
//cSPUIndex:					index of the SPU whose bucket info is updated
//cIgnoreProcessing:	if false, nothing is done while IsSPUProcessing(cSPUIndex) reports true
void CJobManSPU::UpdateSPUMemManSingleSPU(const uint32 cSPUIndex, const bool cIgnoreProcessing)
{
	using NSPU::SBucketHeader;
	using NSPU::SBucket;
	using NSPU::SBucketInfo;
	using NSPU::SBucketDir;
	SBucketInfo& rBucketInfo = ((SPPUMemRequestData*)m_MemAreaMan.GetSPUMemArea(cSPUIndex))->bucketInfo;
	const uint32 cFreeCount = rBucketInfo.freedCount;
	if(!cIgnoreProcessing && IsSPUProcessing(cSPUIndex))
		return;
	if(cFreeCount != 0)
	{
		//delete from SPU released memory
		//since we do not know the bucket it comes from (would require support from memory manager)
		//	we can just delete the buckets and refill the missing ones
		for(uint32 f=0; f<cFreeCount; ++f)
		{
			//must call free func from engine
			m_FreeFunc((void*)(rBucketInfo.pFreedList[f]));
		}
		rBucketInfo.freedCount = 0;
	}
	//refill buckets
	NSPU::SBucket *pBuckets = &rBucketInfo.bucketHeaders[0];
	for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
	{	
		SBucket& rBucket = pBuckets[j];
		if(rBucket.available == rBucket.numTotal)
			continue;//nothing to do for this bucket
		SBucketHeader& rBuckHeader = *rBucket.pBucketHeader;
		//the directory is located behind the header and the linked list (one byte per block)
		SBucketDir *pDir = (SBucketDir*)((uint8*)pBuckets[j].pBucketHeader + 
			SBucketHeader::scBucketHeaderSize + rBucket.numTotal);
		uint8 *pLinkedList = (uint8*)rBucket.pBucketHeader + SBucketHeader::scBucketHeaderSize;
		rBucket.available = rBucket.numTotal;//reset count
		//go through available free list stored through directory entries and link new entries to end of linked list
		uint8 curIndex = rBuckHeader.dirIndex;//first index
		const uint8 cFirstRefilledIndex = curIndex;//save first index for post loop
		const uint8 cOldFirstIndex = rBuckHeader.listIndex;//save due to aliasing
		uint8 lastCurIndex = rBuckHeader.listIndexEnd;//if nothing is to refill, it remains the same
		//we have to link the curIndex to the rBuckHeader.listIndexEnd
		//	if the list was empty before (end index is BUCKET_NULL), link to dummy for the first time
		uint8 *pLinkEntry = &pLinkedList[rBuckHeader.listIndexEnd];
		uint8 dummy;
		//	apply branch free: if(rBuckHeader.listIndexEnd == BUCKET_NULL) pLinkEntry = &dummy;
		//mask is all ones while listIndexEnd is below BUCKET_NULL (keeps pLinkEntry), 0 if both are equal (selects dummy)
		const uint32 cLinkEntryMask = (uint32)(((int32)(rBuckHeader.listIndexEnd - BUCKET_NULL)) >> 31);
		pLinkEntry = (uint8*)(((uint32)&dummy) & ~cLinkEntryMask | ((uint32)pLinkEntry & cLinkEntryMask));
		while(curIndex != BUCKET_NULL)
		{
			SBucketDir& rCurDirEntry = pDir[curIndex];//get directory entry
			lastCurIndex = curIndex;//store last curIndex to apply to rBuckHeader.listIndexEnd in post loop
			//the link index must be read before the entry is overwritten with the new address
			curIndex = rCurDirEntry.GetLinkIndex();//get next index from directory free list
			rCurDirEntry.address = (uint32)(m_MallocFunc(rBucket.size, std::min((uint32)128, rBucket.size)));
			*pLinkEntry = lastCurIndex;//link to current block
			pLinkEntry  = &pLinkedList[lastCurIndex];//for next loop linkage			
		}
		rBuckHeader.listIndexEnd = lastCurIndex;//mark current index as last index
		//if we had allocated all blocks, reset first index, apply branch free:
		//	if(rBuckHeader.listIndex == BUCKET_NULL) rBuckHeader.listIndex = cFirstRefilledIndex
		const uint32 cListIndexMask = (uint32)(((int32)(cOldFirstIndex - BUCKET_NULL)) >> 31);
		rBuckHeader.listIndex = cFirstRefilledIndex & ~cListIndexMask | cOldFirstIndex & cListIndexMask;

		//reset header, no blocks are pending for a refill anymore
		rBuckHeader.dirIndex		= BUCKET_NULL;
		rBuckHeader.dirIndexEnd	= BUCKET_NULL;
	}
}

//releases the memory freed by the SPUs and refills the buckets of all SPUs while holding the job queue lock
//if SN support is compiled in and debugging is active, the update is skipped; debugging gets
//	deactivated once no SPU job is active anymore
void CJobManSPU::UpdateSPUMemMan()
{
#if defined(SUPP_SN)
	if(IsDebuggingActive())
	{
		//all spu jobs have been finished, debugging must have been disabled
		if(!SPUJobsActive())
			SetDebuggingActive(false);
		return;
	}		
#endif
	Lock();
#if defined(SUPP_SN)
	m_FrameDebuggingActive = false;//reset
#endif
	//release memory specified by SPUs and refill buckets
	//SPUs which are still processing are skipped by UpdateSPUMemManSingleSPU itself
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
		UpdateSPUMemManSingleSPU(i);
	UnLock();
}
	
void CJobManSPU::ShutDown()
{
	using NSPU::SBucketHeader;
	using NSPU::SBucket;
	using NSPU::SBucketInfo;
	using NSPU::SBucketDir;

	if(m_pLog)
		m_pLog->Log("Shutting down SPUs...");
	else
		PrintOut("Shutting down SPUs...");
	UpdateSPUMemMan();

	//free all allocated memory
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		SBucketInfo& rBucketInfo = ((SPPUMemRequestData*)m_MemAreaMan.GetSPUMemArea(i))->bucketInfo;
		NSPU::SBucket *pBuckets = &rBucketInfo.bucketHeaders[0];
		for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
		{	
			const SBucketHeader& crBuckHeader = *pBuckets[j].pBucketHeader;
			SBucketDir *pDir = (SBucketDir*)((uint8*)pBuckets[j].pBucketHeader + 
				SBucketHeader::scBucketHeaderSize + pBuckets[j].numTotal);

			for(uint32 b=0; b<pBuckets[j].numTotal; ++b)
				if(!pDir[b].IsUnused())
					m_FreeFunc((uint8*)pDir[b].address);
		}
	}

	for(uint32 i=0; i<NPPU::scMaxSPU; ++i)
		m_FreeFunc(m_pBucketDirMem[i]);

	if(m_pPageDir)
		m_FreeFunc((uint8*)m_pPageDir);

#ifdef DO_SPU_FUNCPROFILING
	m_FreeFunc((uint8*)m_pFuncProfilingArea);
	m_FreeFunc((uint8*)m_pFuncProfilingAreaLastFrame);
#endif

	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		if(SUCCEEDED != sys_interrupt_thread_disestablish(m_SpuIntHandle[i]))
		{
			if(m_pLog)
				m_pLog->LogError("Failed to disestablish interrupt thread for SPU id: %d\n",m_SpuIDs[i]);
			PrintOut("Failed to disestablish interrupt thread for SPU id: %d\n",m_SpuIDs[i]);
		}
		if(SUCCEEDED != sys_interrupt_tag_destroy(m_SpuIntTags[i]))
		{
			if(m_pLog)
				m_pLog->LogError("Failed to destroy interrupt tag for SPU id: %d\n",m_SpuIDs[i]);
			PrintOut("Failed to destroy interrupt tag for SPU id: %d\n",m_SpuIDs[i]);
		}
		if(SUCCEEDED != sys_raw_spu_destroy(m_SpuIDs[i]))
		{
			if(m_pLog)
				m_pLog->LogError("Failed to destroy SPU id: %d\n",m_SpuIDs[i]);
			PrintOut("Failed to destroy SPU id: %d\n",m_SpuIDs[i]);
		}
#if defined(SUPP_SN)
		snRawSPUNotifyDestruction(m_SpuIDs[i]);
#endif
	}
#if defined(SUPP_SPU_FRAME_STATS)
	delete [] m_pFrameProfileData;
#endif
	if(m_SpursInitialized)
	{
		cellSpursFinalize(&m_SPURS);
		spu_printf_finalize();		
		cellSysmoduleUnloadModule(CELL_SYSMODULE_SPURS);
	}
	PrintOut("done\n");
};

//collects the function profiling timings of all jobs which exceed a threshold and resets the profiling area
//rpCurFuncProfStatVec:	receives a pointer to the sorted statistics (NULL if no function exceeded the threshold)
//rCount:								receives the number of statistic entries
//cThresholdUSecs:			only functions with a timing above this value are reported
//without DO_SPU_FUNCPROFILING only rCount is set (to 0)
void CJobManSPU::GetAndResetSPUFuncProfStats(const SFrameProfileData*& rpCurFuncProfStatVec, uint32& rCount, const uint32 cThresholdUSecs)
{
#ifdef DO_SPU_FUNCPROFILING
	//iterate all jobs and retrieve functions above threshold and set name pointer
	m_FuncProfStats.resize(0);
	const std::vector<SJobStringHandle>::const_iterator cEnd = m_JobStringTable.end();
	LockFuncProfilingArea();
	uint32 *pLastFrameDatasPPU = m_pFuncProfilingAreaLastFrame;
	for(std::vector<SJobStringHandle>::const_iterator it=m_JobStringTable.begin();it!=cEnd;++it)
	{
		const SJobStringHandle& __restrict crJobStringHandle = *it;
		const NPageBin::SJob *const __restrict pJob = (NPageBin::SJob*)((void*)crJobStringHandle.jobHandle);		
		const uint32 cFuncProfCount = pJob->funcProfCount;
		//the timing offset of a job is stored in units of 128 bytes
		const uint32 *__restrict pJobTimings = &m_pFuncProfilingArea[pJob->funcProfTimingOff128*(128/sizeof(uint32))];
		//trap if the timings of this job are not located inside the profiling area
		//NOTE(review): this also traps for an offset of 0 (pJobTimings == m_pFuncProfilingArea) - confirm this is intended
		if (pJobTimings <= m_pFuncProfilingArea || pJobTimings > (uint32_t*)((char*)m_pFuncProfilingArea + m_pSPURep->funcProfAreaSize)) { __asm__ volatile ("tw 31,1,1\n" :::); }; 
		for(uint32 i=0; i<cFuncProfCount; ++i)
		{
			//each function owns 2 consecutive words: timing and call count
			const uint32 cFuncTiming = *pJobTimings++;
			const uint32 cFuncCount	 = *pJobTimings++;
			const uint32 cFuncTimingLastFrame = *pLastFrameDatasPPU;
			*pLastFrameDatasPPU++ = cFuncTiming;//becomes the last frame timing of the next call
			if(cFuncTiming > cThresholdUSecs)
			{
				SFrameProfileData stat;
				stat.usec			= cFuncTiming;
				stat.usecLast	= cFuncTimingLastFrame;
				stat.count		= cFuncCount;
				stat.cpName		= &m_pFuncProfStringTable[m_pFuncStringIndices[pJob->funcProfIndStart+i]];
				m_FuncProfStats.push_back(stat);
			}
		}
	}
	ResetFuncProfilingArea();
	UnlockFuncProfilingArea();
	std::sort(m_FuncProfStats.begin(),m_FuncProfStats.end());
	//do not index into an empty vector
	rpCurFuncProfStatVec = m_FuncProfStats.empty() ? NULL : &m_FuncProfStats[0];
	rCount = m_FuncProfStats.size();
#else
	rCount = 0;
#endif
}

//copies the SPU frame statistics of the memory area manager into rStats and resets them if cReset is set
//does nothing if SUPP_SPU_FRAME_STATS is not defined
void CJobManSPU::GetAndResetSPUFrameStats(NPPU::SSPUFrameStats& rStats, const bool cReset)
{
#if defined(SUPP_SPU_FRAME_STATS)
	m_MemAreaMan.GetSPUFrameStats(rStats);
	if(!cReset)
		return;//caller wants to keep the statistics
	m_MemAreaMan.ResetStats(/*m_SPUJobQueue.pull.lockObtained-1*/);	
#endif
}

//obtains and resets the SPU frame stats and collects the per job frame profiling data of the back buffer
//rStats:							receives the SPU frame statistics of the memory area manager
//rpCurFrameProfVec:	receives a pointer to the sorted profiling entries (NULL if the vector is empty)
//rCount:							receives the number of valid profiling entries
//switches the current frame buffer index to the back buffer afterwards
//without SUPP_SPU_FRAME_STATS only rCount is set (to 0)
void CJobManSPU::GetAndResetSPUFrameStats(SSPUFrameStats& rStats, const SFrameProfileData*& rpCurFrameProfVec, uint32& rCount)
{
#if defined(SUPP_SPU_FRAME_STATS)
	const uint32 cFrontIndex = m_CurFrameBufIndex;
	//the back buffer is the next buffer, wrapping around at JOB_FRAME_STATS_BUFS
	uint32 backIndex			= (cFrontIndex + 1);		backIndex = (backIndex > (JOB_FRAME_STATS_BUFS-1))?0:backIndex;
	m_MemAreaMan.GetSPUFrameStats(rStats);
	m_MemAreaMan.ResetStats(/*m_SPUJobQueue.pull.lockObtained-1*/);
	const uint32 cJobNum = m_JobNum;
	std::vector<SFrameProfileData>& __restrict rCurFrameProfVec = m_CurFrameProfVec[backIndex];
	//copy all elements and reset
	//NOTE(review): assumes the vector holds at least m_JobNum elements - confirm where it is sized
	uint32 curCount = 0;
	for(uint32 i=0; i<cJobNum; ++i)
	{
		SFrameProfileData& __restrict rFrameStats = m_pFrameProfileData[i*JOB_FRAME_STATS_BUFS+backIndex];
		if(rFrameStats.count > 0)
		{
			rCurFrameProfVec[curCount++] = rFrameStats;
			rFrameStats.Reset();
		}
	}
	rCount = curCount;
	//sort only the entries written above, the slots behind curCount may still hold entries of earlier calls
	std::sort(rCurFrameProfVec.begin(), rCurFrameProfVec.begin() + curCount);
	//do not index into an empty vector
	rpCurFrameProfVec = rCurFrameProfVec.empty() ? NULL : &rCurFrameProfVec[0];
	m_CurFrameBufIndex		= backIndex;
#else
	rCount = 0;
#endif//SUPP_SPU_FRAME_STATS
}

//stores whether the SPU driver is to be debugged as well, only has an effect if SUPP_SN is defined
void CJobManSPU::EnableSPUDriverDebugging(const bool cEnable)
{
#if !defined(SUPP_SN)
	(void)cEnable;//nothing to store without SN support
#else
	m_SPUDriverDebuggingEnabled = cEnable;
#endif
}

//registers the job to be debugged (NULL disables it), only has an effect if SUPP_SN is defined
//cJobHandle is interpreted as NPPU::TJobHandle, its jobHandle member is stored in m_SPUJobDebugHandle
void CJobManSPU::EnableSPUJobDebugging(void* cJobHandle)
{
#if defined(SUPP_SN)
	void *pDebugHandle = NULL;
	if(cJobHandle)
		pDebugHandle = (void*)((NPPU::TJobHandle)cJobHandle)->jobHandle;
	m_SPUJobDebugHandle = pDebugHandle;
#endif
}

namespace NPPU
{
	//printf style output which is suppressed unless printf is enabled in the job manager
	//returns the result of vprintf, or 0 if nothing has been printed
	int PrintOut( const char *fmt, ... )
	{
		int written = 0;
		if(CJobManSPU::Instance()->IsPrintfEnabled())
		{
			va_list argList;
			va_start(argList, fmt);
			written = vprintf(fmt, argList);
			va_end(argList);
		}
		return written;
	}
}

#undef MAX_ITER

#endif //PS3
