/*
	implementation of spu job manager

	DMA memory mappings can be issued in any order
*/

#if defined(PS3)

#define eCryModule eCryM_Launcher
#include <CryModuleDefs.h>
#include <ILog.h>
#include "JobManSPU.h"
#include <raw_spu.h>
#include "PPU.h"
#include "../SPU/Elf.h"
#include "SPUJobBase.h"
#include <limits>
#include <sys/timer.h>
#include <sys/spu_initialize.h>	//for sys_spu_initialize
#include <sys/spu_utility.h>
#include <sys/ppu_thread.h>
#include <sys/spu_thread.h>
#include <sys/interrupt.h>
#include <sys/time_util.h>
#include <sys/sys_time.h>
#include <sys/paths.h>
#include <cell/sysmodule.h>
#include <cell/cell_fs.h>
#include <ppu_intrinsics_gcc.h>
#if defined(SUPP_SN)
	#include <lib/libsn.h>
#endif
#include "../SPU/CodePage/BubbleLayout.h"

#include <spu_printf.h>

//upper bound for the number of spin iterations WaitSPUJob performs before it reports a timeout
//the debug build uses a smaller limit than the other builds
#if defined(_DEBUG)
	#define MAX_ITER 2000000
#else
	#define MAX_ITER 40000000
#endif

//external symbol for the char array containing the SPU driver code
//the binary entry symbol main is renamed to SPUDriver
//the code is added to the PPU object code via ppu-lv2-objcopy
extern char SPUDriver[];
//parameter block which LoadSPULoaderDriver transfers into the SPU local store
NSPU::SLoaderParams NPPU::CJobManSPU::scSPULoaderParam _ALIGN(4096);
//text of the initial SPU loader which LoadSPULoaderDriver transfers into the SPU local store
uint32 NPPU::CJobManSPU::scInitalSPULoader[NSPU::scLoaderTextSizeBytes >> 2] _ALIGN(128) = SPU_LDR_TEXT;

namespace
{
	//returns true if more than 10 milliseconds have passed since cStartTime
	//	cStartTime: time base value sampled when the job was started
	//both the start time and the current time are measured in time base ticks
	inline const bool JobTimeOutElapsed(const uint64_t cStartTime)
	{
		//10 milliseconds expressed in time base ticks (ticks per second / 100)
		//computed from the full frequency so that the fractional MHz part is not truncated away
		static const uint64_t scMaxPeriod = (uint64_t)sys_time_get_timebase_frequency() / 100;
		uint64_t curTime;
		NPPU::GetTimeTB(curTime);
		return ((curTime - cStartTime) > scMaxPeriod);
	}
}

using namespace NPPU;

//returns the single job manager object
//it is constructed around the embedded SPUDriver image the first time this function is called
CJobManSPU* CJobManSPU::Instance()
{
	static CJobManSPU sJobManager(SPUDriver);
	CJobManSPU* const pJobManager = &sJobManager;
	return pJobManager;
}

//global accessor for the log interface currently registered at the job manager
ILog* GetILog()
{
	CJobManSPU* const pJobManager = CJobManSPU::Instance();
	return pJobManager->GetLog();
}

//registers the log interface the job manager reports to
//	pLog: log interface to store, it is kept as a plain pointer
void CJobManSPU::SetLog(ILog *pLog)
{
	this->m_pLog = pLog;
}

void CJobManSPU::SyncMFCCmd
(
	const uint32 cSPUId, 
	const uint32 cDMATag
) const
{
	const uint32 cTagMask = 0x1 << cDMATag;
	sys_raw_spu_mmio_write(cSPUId, Prxy_QueryMask, cTagMask);
	do 
	{
		Eieio();
	} 
	while((sys_raw_spu_mmio_read(cSPUId, Prxy_TagStatus) & cTagMask) == 0);
}

void CJobManSPU::SendMFCCmd
(
	const uint32 cSPUId, 
	const uint32 cLS, 
	const uint32 cEA, 
	const uint32 cSize, 
	const uint32 cDMATag, 
	const uint32 cCommand
) const
{
	//		const uint32 cEAh = (uint32)((uint64)(cEA)>>32);
	const uint32 cEAh = 0;
	const uint32 cEAl = (uint32)cEA;
	//set the DMA parameters to appropriate registers via MMIO, check the DMA command status by reading the DMACMDStatus register
	//if the status is 0, the DMA command enqueue has succeeded, otherwise, the DMA parameter must be set again
	uint32 dmaStat = 1;
	WriteSPUProbReg(cSPUId, scPCDMALSA, cLS);
	WriteSPUProbReg(cSPUId, scPCDMAEAH, cEAh);
	WriteSPUProbReg(cSPUId, scPCDMAEAL, cEAl);
	WriteSPUProbReg(cSPUId, scPCDMASizeTag, (cSize << 16) | cDMATag);
	do 
	{
		WriteSPUProbReg(cSPUId, scPCDMAClassCMD, cCommand);
		dmaStat = ReadSPUProbReg(cSPUId, scPCDMACMDStatus);
	}while(dmaStat != 0);
}

//sets up the job manager state and determines the size of the SPU driver
//	pDriver: start of the embedded SPU driver image
//without SN support the image is parsed as an SPU ELF and the process exits if parsing fails
//with SN support the image is interpreted as spu_mod_hdr and no parsing takes place
CJobManSPU::CJobManSPU(void* __restrict pDriver) 
	: m_pElfInfo(NULL), m_NumSPUAllowed(scMaxSPU), m_DriverSize(0), 
		m_Initialized(false), m_pLastAddedInfoBlock(NULL), m_pLog(NULL)
{
	m_RealCurPushAddress	= ~0;
	m_pBubbleDir					= NULL;
	m_pSPURep							= NULL;

#if defined(SUPP_SN)
	m_pElfInfo		= (spu_mod_hdr*)pDriver;
	m_DriverSize	= NSPU::AlignSize16(m_pElfInfo->pad);
#else
	//parse the SPU ELF and report the specific reason if it is not usable
	const NSPU::NElf::EParseResult cParseRes = NSPU::NElf::ParseElf(pDriver, m_pElfInfo);
	if(cParseRes == NSPU::NElf::ePR_NoElf)
		printf("ERROR: SPU Driver is no valid ELF-file\n");
	else if(cParseRes == NSPU::NElf::ePR_NoSPUElf)
		printf("ERROR: SPU Driver is an ELF-file but no valid SPU-ELF-file\n");
	else if(cParseRes == NSPU::NElf::ePR_ElfTooBig)
		printf("ERROR: SPU Driver is too big to be uploaded\n");
	else if(cParseRes == NSPU::NElf::ePR_NoQWAddress)
		printf("ERROR: SPU Driver image start is not on a quadword address\n");

	if(cParseRes != NSPU::NElf::ePR_Success)
	{
		//make sure the driver never gets uploaded, then give up
		m_NumSPUAllowed = 0;
		printf("ERROR: SPU Driver elf could not been parsed\n");
		exit(1);
	}

	m_DriverSize = NSPU::AlignSize16(m_pElfInfo->LSDestination + m_pElfInfo->imageSize);
#endif

	//spin lock starts out released
	m_SpinLock = 0;

#if defined(SUPP_SN)
	//no job selected for debugging, debugging is inactive
	m_DebuggingActive						= false;
	m_SPUDriverDebuggingEnabled = false;
	m_SPUJobDebugHandle					= (void*)0;
#endif

#if defined(SUPP_SPU_FRAME_STATS)
	m_CurFrameProfDataIndex = 0;
#endif

#if defined(DO_SPU_PROFILING)
	m_pProfStatControl = NULL;
#endif
}

//spins until the given job state is no longer marked as running
//	rJobState: external job state of the job to wait for
//returns true once the job has finished
//after more than MAX_ITER spin iterations (only counted while debugging is inactive) a timeout is reported:
//	without SN support false is returned
//	with SN support all SPUs are stopped, the job is awaited without limit, the SPUs are continued and true is returned
const bool CJobManSPU::WaitSPUJob(volatile NSPU::NDriver::SExtJobState& rJobState) const
{
	volatile uint32 spinCount = 0;
	for(;;)
	{
		if(!rJobState.IsRunning())
			break;

		//iterations are not counted while a debugging session is active
		bool timedOut = false;
		if(!IsDebuggingActive())
		{
			++spinCount;
			timedOut = (spinCount > MAX_ITER);
		}

		if(timedOut)
		{
			if(m_pLog)
				m_pLog->LogError("Timeout in WaitSPUJob (waiting for job state)\n");
			else
				printf("\nTimeout in WaitSPUJob (waiting for job state)\n");
#if defined(SUPP_SN)
			printf("Stopping all SPUs\n");
			for(uint32 spu=0; spu<m_NumSPUAllowed; ++spu)
			{
				WriteSPUProbReg(spu, scPCSPURunCntl, 0);
				snRawSPUNotifySPUStopped(spu);
			}
			//wait without any limit for the job to finish
			while(rJobState.IsRunning()){}
			printf("Continuing all SPUs\n");
			for(uint32 spu=0; spu<m_NumSPUAllowed; ++spu)
			{
				WriteSPUProbReg(spu, scPCSPURunCntl, 1);
				snRawSPUNotifySPUStarted(spu);
			}
			return true;
#else
			return false;
#endif
		}

		//short pause before the job state is polled again
		for(uint32 nopsLeft=32; nopsLeft!=0; --nopsLeft)
			asm volatile("nop");
	}
	return true;
}

//prints the performance statistics of one job to stdout
//	pPerfStats: statistics of the job, nothing is printed if NULL
//	cpJobName:  name printed in the header line
//only active if DO_SPU_PROFILING is defined and the profiling control flag is present and non zero
void CJobManSPU::PrintPerfStats(const volatile NSPU::NDriver::SJobPerfStats* pPerfStats, const char* cpJobName) const
{
#if defined(DO_SPU_PROFILING)
	if(!pPerfStats || !m_pProfStatControl || !*m_pProfStatControl)
		return;

	//factor converting time base ticks into microseconds
	const float cTicksToUsecs		= 1000000.f / (float)NPPU::GetTimeBaseFrequency();
	//factor converting time base ticks into CPU cycles (currently not printed)
	const float cTicksToCycles	= (float)NPPU::GetCPUFrequency() / (float)NPPU::GetTimeBaseFrequency();

	printf("\n\n------------------Begin JobStatistics for \"%s\"------------------\n\n",cpJobName);

	//timings, job and cache miss time switch to milliseconds above 1000 usec
	printf("SPU driver time:       %.2f usec\n", (float)pPerfStats->spuSetupTime * cTicksToUsecs);
	printf("SPU job fetch time:    %.2f usec\n", (float)pPerfStats->spuFetchTime * cTicksToUsecs);
	const float cJobTimeUsecs = (float)pPerfStats->spuJobTime * cTicksToUsecs;
	if(cJobTimeUsecs > 1000)
		printf("SPU job time:          %.2f ms\n", cJobTimeUsecs * 0.001f);
	else
		printf("SPU job time:          %.2f usec\n", cJobTimeUsecs);
	const float cMissTimeUsecs = (float)pPerfStats->spuCacheMissTime * cTicksToUsecs;
	if(cMissTimeUsecs > 1000)
		printf("SPU cache miss time:   %.2f ms\n", cMissTimeUsecs * 0.001f);
	else
		printf("SPU cache miss time:   %.2f usec\n", cMissTimeUsecs);

	//cache counters
	printf("\nSPU cache hits:        %d\n",pPerfStats->cacheHits);
	printf("SPU cache misses:      %d\n",pPerfStats->cacheMisses);
	printf("SPU cache flush: write backs sync:  %d\n",pPerfStats->cacheWritesBackSync);
	printf("SPU cache flush: write backs async: %d\n",pPerfStats->cacheWritesBackASync);
	printf("SPU cache flush: no write back:     %d\n",pPerfStats->cacheFlushsNoWrite);
	printf("SPU lost lines (write back):    %d\n",pPerfStats->lostLineEvents);
	printf("SPU prefetch cache hits:        %d\n",pPerfStats->prefetchHits);
	printf("SPU prefetch cache misses:      %d\n",pPerfStats->prefetchMisses);
	printf("SPU cache mem transferred to LS:        %d KB\n",pPerfStats->memTransToLS >> 10);
	printf("SPU cache mem transferred back from LS: %d KB\n",pPerfStats->memTransFromLS >> 10);

	//sizes, the first three are shifted left by 4 before being printed as bytes
	printf("\nSPU driverSize size:      %d bytes\n",pPerfStats->driverSize << 4);
	printf("SPU job size:             %d bytes\n",pPerfStats->jobSize << 4);
	printf("SPU 1st bubble size:      %d bytes\n",pPerfStats->firstBubbleSize << 4);
	printf("SPU job avail.stack size: %d Kbytes\n",pPerfStats->stackSize);
	printf("SPU job cache size:       %d Kbytes\n",pPerfStats->cacheSize);

	//allocation counters
	printf("\nSPU allocations (non bucket system):     %d\n", pPerfStats->allocsNoBucket);
	printf("SPU memory releases (non bucket system): %d\n", pPerfStats->freeCountNoBucket);
	printf("SPU allocations (bucket system):         %d\n", pPerfStats->allocsBucket);
	printf("SPU memory releases (bucket system):     %d\n", pPerfStats->freeCount);
	printf("SPU memory releases history matches (bucket system): %d\n", pPerfStats->freeCountHistory);
	printf("SPU allocation size:                  %d\n", pPerfStats->allocSize);
	printf("SPU effective bucket allocation size: %d\n", pPerfStats->totalBucketAllocSize);

	printf("\n-------------------End JobStatistics-------------------\n\n");
#endif
}

void CJobManSPU::HandleSpuInterrupt(uint64_t spuId)
{
	const sys_raw_spu_t cId = spuId;

#if defined(DO_SPU_PROFILING)
	static uint32 sProfFileCntr = 0;
#endif

#if defined(SUPP_SN)
	snRawSPULockHandler();
#endif
	uint64_t stat;
	uint32 mail;
	int resetRet;
	int ret = sys_raw_spu_get_int_stat(cId, 2, &stat);	//create a tag to handle class 2 interrupt, because PPU Interrupt MB
	if(CELL_OK != ret)
	{
//		if(GetILog())
//			GetILog()->LogError("sys_raw_spu_get_int_stat is failed %d\n", ret);
		printf("sys_raw_spu_get_int_stat is failed %d\n", ret);
#if defined(SUPP_SN)
		snRawSPUUnlockHandler();
#endif
		sys_interrupt_thread_eoi();
	}
	//if the caught class 2 interrupt includes mailbox interrupt, handle it
	if((stat & scPCIntStatMailbox) == scPCIntStatMailbox)
	{
		ret = sys_raw_spu_read_puint_mb(cId, &mail);
		if(CELL_OK != ret)
		{
//			if(GetILog())
//				GetILog()->LogError("sys_raw_spu_read_puint_mb is failed %d\n", ret);
			printf("sys_raw_spu_read_puint_mb is failed %d\n", ret);
			sys_interrupt_thread_eoi();
		}
		const cEventID = (mail >> EVENT_PORT_SHIFT);
		switch(cEventID)
		{
		case EVENT_PRINTF_PORT:
			{	
				//spu_sprintf request, address is stored in the non interrupt mailbox
				char buffer[1024];
				uint32 lsAddr = ReadSPUProbReg(cId, scPCPPUMB);
				//get format string and variable values from local store and to print on a console
				spu_raw_sprintf(buffer, GetSPULSBaseAddr(cId), lsAddr);
				//check if it was an failed assertion, exit due to halted SPU
				static char bufferP[1024];
				sprintf(bufferP, "SPU%d: %s", cId, buffer);
				const int cSysRet = printf(bufferP);
//				if(GetILog())
//					GetILog()->Log(bufferP);
				//reset the PU interrupt mailbox interrupt status bit
				resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
				WriteSPUProbReg(cId, scPCSPUMB, cSysRet);//SPUs printf is expecting a return value
#if !defined(SUPP_SN)
				if(memcmp(SPU_ASSERT_STRING, buffer, strlen(SPU_ASSERT_STRING)) == 0)
					abort();
#endif
				break;
			}
			case EVENT_PRINTF_PORT_CUSTOM:
			{
				//reset the PU interrupt mailbox interrupt status bit
				CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
				const char* cpBuf = rMemAreaMan.HandlePrintfRequest(spuId);
//				if(cpBuf && GetILog())
//					GetILog()->Log("SPU%d: %s",(int)spuId,cpBuf);
				WriteSPUProbReg(cId, scPCSigNotify1, 1);//printf is expecting a value
				resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
				break;
			}
		case EVENT_CALLBACK_PORT:
			{	//callback request, index is stored in the lower 16 bit, also set job state to not running
				const uint8 cCallbackIndex = (mail & 0xFF);
#if defined(SUPP_SN)
				if(cCallbackIndex == NPPU::scDebugCallbackPort)
				{
					//release debugging
					CJobManSPU::Instance()->SetDebuggingActive(false);	//inform job manager
				}
				else
				{
#endif
				static NPPU::SQueueNodeSPU& rJobQueue	= CJobManSPU::Instance()->GetJobQueue();
				NPPU::SJobData& rJobData							= rJobQueue.jobData[cCallbackIndex];
//				while(rJobQueue.jobInfoBlocks[cCallbackIndex].jobState.IsRunning()){}//wait til all data has been transferred (time critical with flushing cache)
				//call callback directly
				assert(rJobData.callbackData.pCallbackFnct);
				rJobData.callbackData.pCallbackFnct(rJobData.callbackData.pArg);
				rJobQueue.jobInfoBlocks[cCallbackIndex].jobState.running = 0;//mark job as finished
#if defined(SUPP_SN)
				}
#endif
				//reset the PU interrupt mailbox interrupt status bit
				resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
				break;
			}
		case EVENT_MEMCHANGE_PORT:
			{
				CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
				rMemAreaMan.HandleMemRequest(spuId);
				//reset the PU interrupt mailbox interrupt status bit
				resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
				break;
			}
#if defined(DO_SPU_PROFILING)
		case EVENT_PROF_PORT:
			{
				const uint8 cJobId = (mail & 0xFF);
				CSPUMemAreaMan& rMemAreaMan = CJobManSPU::Instance()->GetMemAreaMan();
				rMemAreaMan.HandleProfRequest(spuId, sProfFileCntr++, cJobId);
				WriteSPUProbReg(cId, scPCSigNotify1, 1);//printf is expecting a value
				resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
				break;
			}
#endif
		default:
			{
//				if(GetILog())
//					GetILog()->LogError("caught unknown interrupt: %d\n",cEventID);
				printf("caught unknown interrupt: %d\n",cEventID);
				resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatMailbox);
				break;
			}
		}
		if(CELL_OK != resetRet)
		{
//			if(GetILog())
//				GetILog()->LogError("sys_raw_spu_set_int_stat is failed: %d\n", resetRet);
			printf("sys_raw_spu_set_int_stat is failed: %d\n", resetRet);
#if defined(SUPP_SN)
			snRawSPUUnlockHandler();
#endif
			sys_interrupt_thread_eoi();
		}
	}
	else
#if defined(SUPP_SN)
	if(stat & scPCIntStatStopSignal)	//stop
	{
		//get stop signal
		NPPU::SSpuStatusRegister status;
		status.val = *(volatile uint32_t*)get_reg_addr(cId, scPCSPUStatus);
		switch(status.sc)
		{
		case 0x3:
			*(volatile uint32_t*)get_reg_addr(spuId, scPCSPURunCntl) = 0x1;			//restart the SPU
			break;

		case 254:
			{	//was custom stop instruction with branch register dest set, move PC to branch target
				//28: offset of spu_mod_hdr->pad (first 32 bytes in non PIC SPUDriver image)
				const uint32 cBranchDest = ReadSPULS(spuId, 28);
				WriteSPUProbReg(spuId, scPCSPUNPC, cBranchDest+4);//make PC point to branch target
				snRawSPUNotifySPUStopped(spuId);
				break;
			}

		case 255:
			{	//was custom stop instruction without register, move PC to next instruction
				const uint32 cCurPC = ReadSPUProbReg(spuId, scPCSPUNPC);
				WriteSPUProbReg(spuId, scPCSPUNPC, cCurPC+4);//make PC point to next instruction
				snRawSPUNotifySPUStopped(spuId);
				CJobManSPU::Instance()->SetDebuggingActive(true);	//inform job manager
				break;
			}
		default:
			snRawSPUNotifySPUStopped(spuId);
			break;
		}
		resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatStopSignal);
	}
	else 
	if(stat & scPCIntStatHaltSignal)	//halt
	{
		snRawSPUNotifySPUStopped(spuId);
		resetRet = sys_raw_spu_set_int_stat(cId, 2, stat & scPCIntStatHaltSignal);
	}
	else
#endif
	{
//		if(GetILog())
//			GetILog()->LogError("unknown interrupt stat: %ld\n", (long int)stat);
		printf("Unknown Interrupt. stat: %ld\n", (long int)stat);
		//reset interrupt status bit of those not handled
		ret = sys_raw_spu_set_int_stat(cId, 2, stat);
		if(CELL_OK != ret)
		{
//			if(GetILog())
//				GetILog()->LogError("sys_raw_spu_set_int_stat failed: %d\n", ret);
			printf("sys_raw_spu_set_int_stat failed: %d\n", ret);
#if defined(SUPP_SN)
			snRawSPUUnlockHandler();
#endif
		}
	}
#if defined(SUPP_SN)
	snRawSPUUnlockHandler();
#endif
	sys_interrupt_thread_eoi();
}

//creates the interrupt PPU thread for one raw SPU and enables its class 2 interrupts
//	cSPUId: id of the raw SPU the handler is created for
//	cIndex: slot in m_SpuIntTags / m_SpuIntHandle receiving the created tag and handle
//returns true if every step succeeded, false as soon as one step failed (the failing step is printed)
const bool CJobManSPU::CreateRawSpuIntrHandler(const sys_raw_spu_t cSPUId, const uint32 cIndex)
{
	//create an interrupt handler and establish it on an interrupt PPU thread 
	//this PPU interrupt thread is going to handle interrupt mailbox events such as SPUs printf
	sys_ppu_thread_t handler;
	int ret = sys_ppu_thread_create(&handler, HandleSpuInterrupt, cSPUId, 2100, scPCPPUStackSize, SYS_PPU_THREAD_CREATE_INTERRUPT, "Interrupt PPU Thread");
	if(CELL_OK != ret)
	{
		printf("sys_ppu_thread_create is failed %d\n", ret);
		return false;//handler is not valid, it must not be established on the interrupt tag
	}
	ret = sys_raw_spu_create_interrupt_tag(cSPUId, 2, SYS_HW_THREAD_ANY, &m_SpuIntTags[cIndex]);
	if(CELL_OK != ret)
	{
		printf("CJobManSPU::CreateRawSpuIntrHandler: sys_raw_spu_create_intr_tag() failed (returned %d)",ret);
		return false;
	}
	ret = sys_interrupt_thread_establish(&m_SpuIntHandle[cIndex], m_SpuIntTags[cIndex], handler, cSPUId);
	if(CELL_OK != ret)
	{
		printf("CJobManSPU::CreateRawSpuIntrHandler: sys_intr_thread_establish() failed (returned %d)",ret);
		return false;
	}
	//set interrupt mask, the third argument = 1 enables PPU Mailbox interrupts
	//with SN support the mask 7 is used, HandleSpuInterrupt then also handles stop and halt signals
#if defined(SUPP_SN)
	ret = sys_raw_spu_set_int_mask(cSPUId, 2, 7);
#else
	ret = sys_raw_spu_set_int_mask(cSPUId, 2, 1);
#endif
	if(CELL_OK != ret)
	{
		printf("CJobManSPU::CreateRawSpuIntrHandler: raw_spu_set_int_mask() failed: %d", ret);
		return false;
	}
/*	ret = sys_raw_spu_set_int_stat(cSPUId, 2, ~0x0);
	if(CELL_OK != ret)
	{
		printf("CJobManSPU::CreateRawSpuIntrHandler: raw_spu_set_int_stat() failed (returned %d)",ret);
		return false;
	}
*/
	return true;
}

//fills the loader parameter block and starts the initial SPU loader on one raw SPU
//	cRealSPUId:  raw SPU id used for the problem state register and DMA access
//	cSPUIndex:   index of the SPU, stored in the upper 8 bit of the fourth main() parameter word
//	cIsRecreate: if true, the SN specific fake load and the start notification are skipped
//the parameter block is transferred first, then the loader text is transferred with a command which also starts the SPU
void CJobManSPU::LoadSPULoaderDriver
(
	const uint32 cRealSPUId,
	const uint32 cSPUIndex,
	const bool cIsRecreate
) const
{
#if defined(SUPP_SN)
	//even so the driver is loaded by the SPU itself, we need to fake the load to make the debugger load the symbols
	#if defined(_DEBUG)
		#define SPU_IMAGE "/app_home/SPUDriver_inc_debug.elf"
	#else
		#if defined(DO_SPU_PROFILING)
			#define SPU_IMAGE "/app_home/SPUDriver_inc_profile.elf"
		#else
			#define SPU_IMAGE "/app_home/SPUDriver_inc_release.elf"
		#endif
	#endif
	if(!cIsRecreate)
	{
		uint32_t entry = 0;//stays defined for the notification below even if the load fails
		int ret = 0;
		if((ret = sys_raw_spu_load(cSPUIndex, SPU_IMAGE, &entry)) != SUCCEEDED)
			printf("Warning: failed to load SPU Driver \"%s\" for debugging, raw_spu_load returned 0x%08x\n", SPU_IMAGE, ret);
		snRawSPUNotifyElfLoad(cSPUIndex, entry, SPU_IMAGE);
	}
	//parameters that will be used by spu loader to load spu driver
//	scSPULoaderParam.bin.imageSize  = m_DriverSize;
	scSPULoaderParam.bin.imageSize  = 0;
	scSPULoaderParam.bin.imageEAlow = (uint32)m_pElfInfo;
	scSPULoaderParam.bin.elfEAlow		= (uint32)m_pElfInfo;
	scSPULoaderParam.bin.destLS     = 0;
	scSPULoaderParam.bin.entry      = m_pElfInfo->entry;
#else
	//parameters that will be used by spu loader to load spu driver, taken from the parsed ELF info
	scSPULoaderParam.bin.imageSize  = m_pElfInfo->imageSize;
	scSPULoaderParam.bin.imageEAlow = m_pElfInfo->spuImageEA.ui[1];
	scSPULoaderParam.bin.elfEAlow		= m_pElfInfo->spuElfEA.ui[1];
	scSPULoaderParam.bin.destLS     = m_pElfInfo->LSDestination;
	scSPULoaderParam.bin.entry      = m_pElfInfo->entry;
#endif

	scSPULoaderParam.bin.imageEAhi  = 0;
	scSPULoaderParam.bin.lsLimit    = NSPU::scLSSize;
	scSPULoaderParam.bin.elfEAhi		= 0;

	//parameters that will be passed to spu main() function
	//word 0: address of the pull part of the job queue or'ed with the number of allowed SPUs
	scSPULoaderParam.thr.gpr3[0]		= (uint32)&m_SPUJobQueue.pull | m_NumSPUAllowed;
	scSPULoaderParam.thr.gpr3[1]		= (uint32)m_MemAreaMan.GetSPUMemArea(0);//id is used to calc correct offset on SPU
	scSPULoaderParam.thr.gpr3[2]		= (uint32)&m_BubbleInfo;
	//word 3: driver size in the lower bits, SPU index in the upper 8 bit
	scSPULoaderParam.thr.gpr3[3]		= m_DriverSize | (cSPUIndex << 24);
	//copy parameters to LS offset SPU_LDR_PARAMS_start. 
	SendMFCCmd(cRealSPUId, (const uint32)NSPU::scLoaderParamsStart, (const uint32)&scSPULoaderParam, sizeof(scSPULoaderParam), 0, scPCMFCGetCMD);
	//set the SPUs program counter
	WriteSPUProbReg(cRealSPUId, scPCSPUNPC, NSPU::scLoaderProgramStart);
#if defined(SUPP_SN)
	if(!cIsRecreate)
		snRawSPUNotifySPUStarted(cSPUIndex);
#endif
	//copy loader to local store and start SPU execution (s bit set in MFC Get command)
	SendMFCCmd(cRealSPUId, (uint32)NSPU::scLoaderProgramStart, (uint32)&scInitalSPULoader, sizeof(scInitalSPULoader), 0, scPCMFCGetFSCMD);
}

void CJobManSPU::CreateBubbleJobStringTable()
{
	NBubBin::SJobStringHandle *pJobStringHandleRep = (NBubBin::SJobStringHandle*)((uint8*)m_pSPURep + sizeof(NBubBin::SHeader));
	const uint32 cBubRepAddr = (uint32)m_pSPURep;
	m_JobStringOffsets.resize(m_pSPURep->jobNum);
	for(uint32 i=0; i<m_pSPURep->jobNum; ++i)
	{
		SJobStringHandle newJobEntry;
		newJobEntry.cpString	= (const char*)(void*)(pJobStringHandleRep->jobStringStart + cBubRepAddr);
		newJobEntry.strLen		= pJobStringHandleRep->jobStringSize;
		newJobEntry.jobHandle = (uint32)(pJobStringHandleRep->jobEntryOffset + cBubRepAddr);
		newJobEntry.jobId		  = i;
		m_JobStringOffsets[i] = pJobStringHandleRep;
		m_JobStringTable.push_back(newJobEntry);
		++pJobStringHandleRep;
	}
	std::sort(m_JobStringTable.begin(), m_JobStringTable.end());
}

const char* CJobManSPU::GetJobName(const uint32 cId)
{
	static char sJobName[64];
	assert(cId < m_JobStringOffsets.size());
	const NBubBin::SJobStringHandle *const cpJobStringHandleRep = m_JobStringOffsets[cId];
	const uint32 cBubRepAddr = (uint32)m_pSPURep;
	const uint32 cStrLen = cpJobStringHandleRep->jobStringSize;
	assert(cStrLen < sizeof(sJobName)-1);
	memcpy(sJobName, (const char*)(void*)(cpJobStringHandleRep->jobStringStart + cBubRepAddr), cStrLen);
	sJobName[cStrLen] = '\0';
	return sJobName;
}

void CJobManSPU::CreateSPUBubbleDir()
{
	m_pBubbleDir = (NSPU::SBubbleInfo*)::operator new(m_pSPURep->bubbleNum * sizeof(NSPU::SBubbleInfo), 16);//align to 16 byte for DMA
	m_BubbleInfo.bubbleDirEA = (uint32)m_pBubbleDir;
	m_BubbleInfo.bubbleNum = m_pSPURep->bubbleNum;
	//get bubble addresses into the directory
	NSPU::SBubbleInfo *pCurBubDir = m_pBubbleDir;
	uint32 *pCurBubbleDirRep = (uint32*)((uint8*)m_pSPURep + m_pSPURep->bubbleOff);
	const uint32 cBubRepAddr = (uint32)m_pSPURep;
	for(uint32 i=0; i<m_BubbleInfo.bubbleNum; ++i)
	{
		pCurBubDir->ea						= cBubRepAddr + *pCurBubbleDirRep++;
		NBubBin::SBubble* pBubble = (NBubBin::SBubble*)(void*)pCurBubDir->ea;
		pCurBubDir->size					= NBubBin::DecodeBubbleSize(pBubble->size);
		++pCurBubDir;
	}
	m_BubbleInfo.globalVarBaseAddr	= (int32)&CJobManSPU::scInitalSPULoader[0];
	m_BubbleInfo.ppuSyncEA					= (uint32)&m_SPUJobQueue;
}

//looks up the handle of a job by its name
//	cpJobName: name of the job
//	cStrLen:   length of the name
//returns the handle of the matching job string table entry
//if no entry matches, an assertion is raised and a static entry carrying INVALID_JOB_HANDLE is returned
//with USE_JOB_QUEUE_VERIFICATION the failure is printed together with all available jobs 
//and the static entry additionally carries a (possibly truncated) copy of the requested name
const TJobHandle CJobManSPU::GetJobHandle(const char* cpJobName, const uint32 cStrLen) const
{
#if defined(USE_JOB_QUEUE_VERIFICATION)
	//create a buffer for the last failed job invocation
	static char sFailedBuf[128];
#endif
	static SJobStringHandle cFailedLookup = {"", 0, INVALID_JOB_HANDLE};
	const SJobStringHandle cLookup = {cpJobName, cStrLen, INVALID_JOB_HANDLE};
	const std::vector<SJobStringHandle>::const_iterator cEnd = m_JobStringTable.end();
	const std::vector<SJobStringHandle>::const_iterator cRes = std::find(m_JobStringTable.begin(), cEnd, cLookup);
	assert(cRes != cEnd);
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(cRes == cEnd)
	{
		//names are printed bounded by their stored length, a terminating zero is not relied upon
		printf("Failed to obtain job handle for job: \"%.*s\"\n", (int)cStrLen, cpJobName);
		for(std::vector<SJobStringHandle>::const_iterator it = m_JobStringTable.begin(); it != cEnd; ++it)
			printf("  available jobs: \"%.*s\"\n", (int)it->strLen, it->cpString);
		printf("\n");
		//strncpy does not terminate if the limit is reached, so terminate explicitly
		//otherwise characters of a previous, longer failed name would remain in the static buffer
		const uint32 cCopyLen = std::min(cStrLen, (uint32)(sizeof(sFailedBuf) - 1));
		strncpy(sFailedBuf, cpJobName, cCopyLen);
		sFailedBuf[cCopyLen] = '\0';
		cFailedLookup.cpString = sFailedBuf;
		cFailedLookup.strLen = cCopyLen;//length of what is actually stored
		return (TJobHandle)&cFailedLookup;
	}
	else
		return (TJobHandle)&(cRes->cpString);
#else
	return (cRes == cEnd)?(TJobHandle)&cFailedLookup : (TJobHandle)&(cRes->cpString);
#endif
}

const bool CJobManSPU::InitSPUs(const char* cpSPURepository)
{
#if defined(SUPP_SN)
	if(snInit() != SUCCEEDED)
	{
		printf("Failed to initialize libsn\n");
		return false;
	}
#endif

	if(m_NumSPUAllowed == 0)
		return false;//something has gone wrong during initialization
	assert(!m_Initialized);

	//load SPU repository, memory has to be 128 byte aligned to keep all structures aligned
	int ret = cellSysmoduleLoadModule(CELL_SYSMODULE_FS);
	if(ret) 
	{
		printf("InitSPUs: cellSysmoduleLoadModule(CELL_SYSMODULE_FS) error 0x%x\n", ret);
		return false;
	}

	int repFile;
	ret = cellFsOpen(cpSPURepository, CELL_FS_O_RDONLY, &repFile, NULL, 0);
	if(ret != CELL_FS_SUCCEEDED)
	{
		printf("InitSPUs: failed to open SPU repository file: %s\n", cpSPURepository);
//		return false;
	}
	else
	{
		CellFsStat repStat;
		if(CELL_FS_SUCCEEDED != cellFsStat(cpSPURepository, &repStat))
		{
			printf("InitSPUs: failed to obtain file stats of SPU repository file: %s\n", cpSPURepository);
			cellFsClose(repFile);
			return false;
		}

		size_t readLen = repStat.st_size;
		if(readLen <= 0)
		{
			printf("InitSPUs: failed to read SPU repository file: %s has length: %d\n", cpSPURepository, readLen);
			cellFsClose(repFile);
			return false;
		}

		m_pSPURep = (NBubBin::SHeader*)::operator new(readLen, 128);//allocate with alignment
		uint64_t readLenRes;
		ret = cellFsRead(repFile, m_pSPURep, readLen, &readLenRes);
		if(ret != CELL_FS_SUCCEEDED || readLen != readLenRes) 
		{
			printf("InitSPUs: failed to read SPU repository file: %s\n", cpSPURepository);
			cellFsClose(repFile);
			return false;
		}
		//build job string table
		CreateBubbleJobStringTable();	
		//build SPU bubble directory (bubble main mem address)
		CreateSPUBubbleDir();

		cellFsClose(repFile);
	}

	sys_spu_initialize(scMaxSPU, m_NumSPUAllowed);

	m_SPUJobQueue.push.lockObtained				= 0;
	m_SPUJobQueue.push.baseAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[0];
	m_SPUJobQueue.push.topAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[scMaxWorkQueueJobs];//pointing right behind it
	m_SPUJobQueue.push.curAddr						= m_SPUJobQueue.push.baseAddr;

	m_SPUJobQueue.pull.lockObtained				= 0;
	m_SPUJobQueue.pull.baseAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[0];
	m_SPUJobQueue.pull.topAddr						= (uint32)&m_SPUJobQueue.jobInfoBlocks[scMaxWorkQueueJobs];
	m_SPUJobQueue.pull.curAddr						= m_SPUJobQueue.pull.baseAddr;
	memset(m_SPUJobQueue.spuPacketSync, 0, SQueueNodeSPU::scSyncCount);
	m_CurSpuPacketSyncIndex = 127;//to be counted down to 1 then starting at 127 again..

	InitBucketMemory();

	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		if(EAGAIN == sys_raw_spu_create((sys_raw_spu_t*)&m_SpuIDs[i], NULL))
		{
			m_NumSPUAllowed = i;
			printf("Available number of SPUs has been reseted to: %d\n",i);
			break;
		}

#if defined(SUPP_SN)
		snRawSPUNotifyCreation(m_SpuIDs[i]);
#endif
		//reset all pending interrupts before starting
//		sys_raw_spu_set_int_stat(m_SpuIDs[i], 2, 0xFUL);
//		sys_raw_spu_set_int_stat(m_SpuIDs[i], 0, 0xFUL);
//		sys_raw_spu_set_spu_cfg(m_SpuIDs[i], 0);//no OR mode

		CreateRawSpuIntrHandler(m_SpuIDs[i], i);

		//run the SPU program, this will look at the job queue and start trying to get work to do
		LoadSPULoaderDriver(m_SpuIDs[i], i);
	}
	TestSPUs();//test if the SPUs have been started
	m_Initialized = true;
	
	printf("JobManager: init %d SPUs(SPU Driver: %.1f KB)\n", m_NumSPUAllowed, (float)m_DriverSize / 1024.f);

	return true;
}

//gets job slot for next job (to get storage index for SJobdata), waits until a job slots becomes available again since data get overwritten
//gets job slot for next job (to get storage index for SJobdata), waits until a job slots becomes available again since data get overwritten
//	rJobSlot:  receives the index of the job info block to use
//	rNextPush: receives the push address following the chosen slot (wrapped to the base address at the top)
//returns eAJR_Success once a slot is found whose job state is not running
//slots with a still running job are marked as not fetchable and skipped
//with USE_JOB_QUEUE_VERIFICATION a timeout result is returned if the queue stays full or if a running job exceeded its time limit
//the address wrap arounds are implemented branch free via sign masks, the replaced branches are kept as comments
const NPPU::EAddJobRes NPPU::CJobManSPU::GetSPUJobSlot(uint32& __restrict rJobSlot, uint32& __restrict rNextPush)
{
	//do changes in sync with SPU implementation in JobAPI_spu.cpp
	//wait til a job slot becomes available
	uint32 curPush;
	//get current push address according to if it is a depending job or not, replace following code by mask
	//	curPush = (m_RealCurPushAddress == ~0)?m_SPUJobQueue.push.curAddr : m_RealCurPushAddress;//get current push address
	//mask is 0 if m_RealCurPushAddress equals ~0, otherwise it is meant to be all ones
	//NOTE(review): the latter relies on the sign bit of -(address + 1), presumably addresses have the top bit clear - confirm
	const uint32 cCurPushMask = (uint32)(((int32)(-(m_RealCurPushAddress - ~0))) >> 31);
	curPush = m_RealCurPushAddress & cCurPushMask | m_SPUJobQueue.push.curAddr & ~cCurPushMask;

#if defined(USE_JOB_QUEUE_VERIFICATION)//assume we never hit it, enlarge it
	static const uint32 scMaxWaitLoopCount = 10;//after 10 loops of sleeping, it returns false
#endif

	while(1)//continue til we found an empty job slot
	{
		rNextPush = curPush + NSPU::NDriver::scSizeOfSJobQueueEntry;
		//start at index 0 if we reached the end, replace following code by mask
		//	if(rNextPush == m_SPUJobQueue.push.topAddr)	rNextPush = m_SPUJobQueue.push.baseAddr;
		//mask is all ones while rNextPush is below the top address and 0 once it reached it
		const uint32 cNextPushMask = (uint32)(((int32)(rNextPush - m_SPUJobQueue.push.topAddr)) >> 31);
		rNextPush = rNextPush & cNextPushMask | m_SPUJobQueue.push.baseAddr & ~cNextPushMask;

#if defined(USE_JOB_QUEUE_VERIFICATION)//assume we never hit it, enlarge it
		//queue is full as long as the next push address equals the current pull address
		uint32 i = 0;//counter for idle loops
		while(rNextPush == m_SPUJobQueue.pull.curAddr)
		{
			sys_timer_usleep(1);//wait 1 microsecond
			++i;
			if(i == scMaxWaitLoopCount)
				return eAJR_EnqueueTimeOut;
		}
#endif
		//now check if job in push job slot has been finished, if not, increment push address and mark as not to be pulled next time
		//the slot index is the distance to the base address divided by the queue entry size
		rJobSlot = (uint32)((curPush - m_SPUJobQueue.push.baseAddr) >> NSPU::NDriver::scSizeOfSJobQueueEntryShift);
		assert(rJobSlot < scMaxWorkQueueJobs);
		NSPU::NDriver::SInfoBlock& __restrict rJobInfoBlock	= m_SPUJobQueue.jobInfoBlocks[rJobSlot];
		if(__builtin_expect(!rJobInfoBlock.jobState.IsRunning(), true))
			return eAJR_Success;
#if defined(USE_JOB_QUEUE_VERIFICATION)
		NPPU::SJobData& __restrict rJobdata									= m_SPUJobQueue.jobData[rJobSlot];
		//job is still in progress, check timeout and get next one
		if(JobTimeOutElapsed(rJobdata.jobStartTime))
			return eAJR_EnqueueTimeOutPushJob;
		else
#endif
		{
			//mark entry as invalid, increment current push and next push address
			//this job remains in the queue
			rJobInfoBlock.SetFetchable(false);//not to be fetched this round (still running)
			curPush += NSPU::NDriver::scSizeOfSJobQueueEntry;
			//start at index 0 if we reached the end, replace following code by mask
			//	if(curPush == m_SPUJobQueue.push.topAddr)	curPush = m_SPUJobQueue.push.baseAddr;
			//same wrap around mask as used for rNextPush above
			const uint32 cCurNextPushMask = (uint32)(((int32)(curPush - m_SPUJobQueue.push.topAddr)) >> 31);
			curPush = curPush & cCurNextPushMask | m_SPUJobQueue.push.baseAddr & ~cCurNextPushMask;
		}
	}
	//not reached, the loop above is only left via return
	return eAJR_Success;
}

const EAddJobRes CJobManSPU::AddJob
(
	CSPUJobDel& __restrict crJob,
	const uint32 cOpMode,
	const uint8 cMinStackSizeKB,
	const TJobHandle cJobHandle,
	const uint32 cIsDependentJob
)
{
	volatile NSPU::NDriver::SExtJobState *__restrict const pJobState = crJob.m_pJobState;
	__dcbt(pJobState);
	assert(cJobHandle && cJobHandle->jobHandle != INVALID_JOB_HANDLE);
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(!cJobHandle || cJobHandle->jobHandle == INVALID_JOB_HANDLE)
		return eAJR_InvalidJobHandle;
#else
	if(cJobHandle->jobHandle == INVALID_JOB_HANDLE)
		return eAJR_InvalidJobHandle;
#endif

	NBubBin::SJob *pJob = (NBubBin::SJob*)((void*)cJobHandle->jobHandle);

	void* __restrict pJobProgramData	= (void*)pJob;
	const uint32 cJobProgramSize			= ((uint32)pJob->totalJobSize << 2);//stored in multiple of 4 bytes
	assert(pJob->initialBubbles[0] >= 0);
	const uint16 cFirstBubIndex				= (uint16)pJob->initialBubbles[0];

	const uint32 cJobId								= cJobHandle->jobId;

#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(!m_Initialized)
		return eAJR_SPUNotInitialized;
#endif

	assert(pJobProgramData);
	uint32 jobSlot;
	uint32 nextPush;
	
	const void *cpQueue		= crJob.GetQueue();
	const bool cNoQueue		= (cpQueue == NULL);

	//setup for all spu packets
	const uint32 cOrigParamSize = crJob.GetParamDataSize();
	const uint8 cParamSize	= cOrigParamSize >> 4;
	const uint16 cJobSize		= cJobProgramSize >> 2;
	assert((cJobProgramSize & 3) == 0);
	const SCallback& __restrict crCallback = crJob.GetCallbackdata();

	if(__builtin_expect(!cIsDependentJob, true))
		Lock();

	const EAddJobRes cEnqRes = GetSPUJobSlot(jobSlot, nextPush);
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(cEnqRes != eAJR_Success)
		return cEnqRes;
#endif
	NSPU::NDriver::SInfoBlock& __restrict rInfoBlock	= m_SPUJobQueue.jobInfoBlocks[jobSlot];
	rInfoBlock.Reset(cpQueue, !cIsDependentJob, cMinStackSizeKB);
	rInfoBlock.eaDMAJobAddress					= (uint32)pJobProgramData;
	rInfoBlock.jobSize									= cJobSize;
	rInfoBlock.paramSize								= cParamSize;
	rInfoBlock.SetOpMode(cOpMode);
	rInfoBlock.firstBubbleIndex					= cFirstBubIndex;
	rInfoBlock.jobId										= cJobId;
#if defined(SUPP_SN)
	if((uint32)pJobProgramData == (uint32)m_SPUJobDebugHandle)
	{
		m_SPUJobDebugHandle = (void*)0;//reset
		rInfoBlock.EnableDebug();
		rInfoBlock.EnableDriverDebug(m_SPUDriverDebuggingEnabled);
	}
#endif

#if defined(SUPP_SPU_FRAME_STATS)
	SFrameProfileData* const cpFrameProfileData = crJob.GetFrameProfData();
	//find out if is has been added already this frame
	const uint32 cCurFrameProfDataIndex = m_CurFrameProfDataIndex;
	SFrameProfileData*const __restrict *const __restrict ppFrameProfData = m_SPUJobQueue.frameProfData;
	uint32 i=0;
	for(; i<cCurFrameProfDataIndex; ++i)
	{
		if(ppFrameProfData[i] == cpFrameProfileData)
			break;
	}
	if(i == cCurFrameProfDataIndex)
	{
		++m_CurFrameProfDataIndex;
		assert(m_CurFrameProfDataIndex < MAX_PROFILE_JOBS);
		m_SPUJobQueue.frameProfData[i] = cpFrameProfileData;
	}
	++ppFrameProfData[i]->count;
	rInfoBlock.frameProfIndex = (unsigned char)i;
#endif

	volatile const CCommonDMABase** __restrict ppPackets;
	volatile const CSPUPacketBase** __restrict ppSPUPackets;
	uint32 packetCount = 0, spuPacketCount = 0;
	uint32 extJobAddr = 0;

	//if a producer/consumer queue is used, do neither set parameter addresses nor add packets nor set external job state
	if(cNoQueue)
	{
		crJob.GetAllPackets(packetCount, spuPacketCount, ppPackets, ppSPUPackets);//always valid pointer (static array)
		if(__builtin_expect(spuPacketCount > 0, false))
		{
			rInfoBlock.spuPacketSyncIndex			= m_CurSpuPacketSyncIndex--;
			assert(m_SPUJobQueue.spuPacketSync[rInfoBlock.spuPacketSyncIndex] == 0);//should be already counted down (semaphore like)
			m_SPUJobQueue.spuPacketSync[rInfoBlock.spuPacketSyncIndex] = spuPacketCount;//to be counted down
			//since we have index 0 reserved for locking, we need to decrement for branch free behavior
			//replace following code by mask:
			//	if(m_CurSpuPacketSyncIndex == 0)
			//		m_CurSpuPacketSyncIndex = NPPU::SQueueNodeSPU::scSyncMaxIndex;
			const uint32 cSyncMask = (uint32)(((int32)(-m_CurSpuPacketSyncIndex)) >> 31);
			m_CurSpuPacketSyncIndex = NPPU::SQueueNodeSPU::scSyncMaxIndex & ~cSyncMask | m_CurSpuPacketSyncIndex & cSyncMask;
		}

		//get DMA transfer entries from job base
		if(__builtin_expect(packetCount == 0, true))
			CreateDMAListSingle(cOrigParamSize, rInfoBlock.GetParamAddress(), crJob);
		else
			CreateDMAList(cOrigParamSize, rInfoBlock,	crJob, ppPackets, packetCount);

		if(__builtin_expect(pJobState != NULL, true))
		{
			extJobAddr = (uint32)pJobState;
			rInfoBlock.SetExtJobStateAddress(extJobAddr);
			pJobState->running = 1;//set running
		}
	}//cHasQueue

#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(cJobProgramSize >= GetAvailableSPUSize())
		return eAJR_JobTooLarge;
#endif
	//register callback, use if to avoid cache miss otherwise
	void * __restrict pJobData = NULL;
	if(__builtin_expect(crCallback.pCallbackFnct != 0, false))
	{
		NPPU::SJobData& __restrict rJobdata = m_SPUJobQueue.jobData[jobSlot];
		rJobdata.callbackData			= crCallback;
		rInfoBlock.callbackIndex	= jobSlot;
		pJobData = &rJobdata;
	}	
	assert(jobSlot < 256);

#if defined(DO_SPU_PROFILING)
	rInfoBlock.eaJobPerfAddress = (uint32)(void*)crJob.m_pJobPerfData;
	rInfoBlock.SetTransferProfDataBack(m_pProfStatControl && *m_pProfStatControl != 0);
#endif

	//if it is a depending job, mark it as not fetchable
	assert(cIsDependentJob <= 0x1);

	const CSPUJobDel::TDepJob* __restrict ppDepJobs = NULL;
	const uint32 cDepJobCount = crJob.GetDependentJobs(ppDepJobs);

	//for a dependent job, set address to pEntry into the SJobInfo structure
	m_RealCurPushAddress = nextPush;//set next push address, real push pointer visible to SPU is not yet updated
	if(__builtin_expect(cIsDependentJob, false))
	{
		assert(m_pLastAddedInfoBlock);
		//do not update push pointer yet since the address to the dependent jobs has to get set up first
		//set depending job address
		m_pLastAddedInfoBlock->SetDependentJobIndex(jobSlot);
	}
	else
	{
		//replace following code by mask:
		//	if(cDepJobCount == 0)//was no dependent jobs and has none
		//	{
		//		m_SPUJobQueue.push.curAddr = nextPush;//entry usage is safe til we increment push address so that pull address can fetch last set entry
		//		m_RealCurPushAddress = ~0;//reset
		//	}
		const uint32 cDepJobCountMask = (uint32)(((int32)(-cDepJobCount)) >> 31);
		rInfoBlock.jobState.running		= (uint32)(cDepJobCount == 0);//job is available for SPUs
		m_SPUJobQueue.push.curAddr		= nextPush & ~cDepJobCountMask | m_SPUJobQueue.push.curAddr & cDepJobCountMask;
		m_RealCurPushAddress					= ~0 & ~cDepJobCountMask | m_RealCurPushAddress & cDepJobCountMask;
	}

	if(__builtin_expect(cDepJobCount > 0, false))
	{
		//update pointer to most recently added depending job
		m_pLastAddedInfoBlock = &rInfoBlock;
		for(uint32 i=0; i<cDepJobCount; ++i)
		{
			CSPUJobBase& __restrict pJobBase = (CSPUJobBase&)*ppDepJobs[i];
			pJobBase.m_JobDelegator.RunJob(pJobBase.GetOpMode(), pJobBase.GetMinStackSizeKB(), pJobBase.GetJobProgramData(), 1/*dep job*/);
		}
		if(!cIsDependentJob)
		{
			//update push address for subsequent calls if it is no dependent job but has some
			m_SPUJobQueue.push.curAddr = m_RealCurPushAddress;
			m_RealCurPushAddress = ~0;//reset
		}
		rInfoBlock.jobState.running = 1;//job is already available for SPUs
	}
	const uint32 *const pInfoBlockRemaining	= (uint32*)&rInfoBlock.jobId;
	//now set up the spu packet jobs, first one has been made available already
	//dependent jobs can only be attached to the main job
	for(uint32 i=0; i<spuPacketCount; ++i)
	{
		const CSPUPacketBase& __restrict crSPUPacket = (const CSPUPacketBase&)*ppSPUPackets[i];

		const EAddJobRes cEnqRes = GetSPUJobSlot(jobSlot, nextPush);
	#if defined(USE_JOB_QUEUE_VERIFICATION)
		if(cEnqRes != eAJR_Success)
			return cEnqRes;
	#endif
		NSPU::NDriver::SInfoBlock& __restrict rAddInfoBlock	= m_SPUJobQueue.jobInfoBlocks[jobSlot];
		rAddInfoBlock.eaDMAJobAddress		= (uint32)pJobProgramData;
//		rAddInfoBlock.flags2						= rInfoBlock.flags2;
		rAddInfoBlock.SetExtJobStateAddress(extJobAddr);
#if defined(DO_SPU_PROFILING)
		rAddInfoBlock.eaJobPerfAddress = 0;//only available for main job
		rAddInfoBlock.SetTransferProfDataBack(false);//will only be set for 1 SPU per job
#endif
		//perform fast copy op
		uint32 *pAddInfoBlockRemaining	= (uint32*)&rAddInfoBlock.jobId;
		pAddInfoBlockRemaining[0]				= pInfoBlockRemaining[0];
		pAddInfoBlockRemaining[1]				= pInfoBlockRemaining[1];
		pAddInfoBlockRemaining[2]				= pInfoBlockRemaining[2];

		rAddInfoBlock.depJobIndex				= NSPU::NDriver::SInfoBlock::scNoIndex;
		//apply callback, since only one raises it, copy from main job
		if(pJobData)
		{
			NPPU::SJobData& __restrict rJobdata = *(NPPU::SJobData*)pJobData;
			NPPU::SJobData& __restrict rAddJobdata = m_SPUJobQueue.jobData[jobSlot];
			rAddJobdata.callbackData						= rJobdata.callbackData;
		}
		volatile const CCommonDMABase** __restrict ppAddPackets;
		uint32 addPacketCount;
		crSPUPacket.GetPackets(addPacketCount, ppAddPackets);//always valid pointer (static array)
		const CSPUJobDel& __restrict crJobDel = crSPUPacket.m_JobDelegator;
		if(packetCount == 0)
			CreateDMAListSingle(cOrigParamSize, rAddInfoBlock.GetParamAddress(), crJobDel);
		else
			CreateDMAList(cOrigParamSize, rAddInfoBlock, crJobDel, ppAddPackets, addPacketCount);

	#if defined(USE_JOB_QUEUE_VERIFICATION)
		if(cJobProgramSize >= GetAvailableSPUSize())
			return eAJR_JobTooLarge;
	#endif

		//mark as fetchable
		rAddInfoBlock.SetFetchable(true);

		//set next push address, real push pointer visible to SPU is not yet updated
		//replace following code by mask:
		//	if(cIsDependentJob)	m_RealCurPushAddress = nextPush;
		//	else m_SPUJobQueue.push.curAddr = nextPush;
		const uint32 cDepJobMask		= (uint32)(((int32)(-cIsDependentJob)) >> 31);
		m_RealCurPushAddress				= nextPush & cDepJobMask | m_RealCurPushAddress & ~cDepJobMask;
		m_SPUJobQueue.push.curAddr	= nextPush & ~cDepJobMask | m_SPUJobQueue.push.curAddr & cDepJobMask;
		rAddInfoBlock.jobState.running = 1;
	}//SPU packet loop

	if(__builtin_expect(!cIsDependentJob, true))
		UnLock();

#if defined(USE_JOB_QUEUE_VERIFICATION)
	{
		NPPU::SJobData& __restrict rJobdata = m_SPUJobQueue.jobData[jobSlot];
		GetTimeTB(rJobdata.jobStartTime);
	}
#endif

	return eAJR_Success;
}

void CJobManSPU::TestSPUs()
{
	if(IsDebuggingActive())
		return;
#if defined(USE_JOB_QUEUE_VERIFICATION)
	bool detectedFailure = false;
#endif
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		const uint32 cStatus = sys_raw_spu_mmio_read(m_SpuIDs[i], scPCSPUStatus);
		if((cStatus & 0xFFFF0000) == 0x3FFF0000/*executed STOPD instr*/ )
		{
			if(m_pLog)
				m_pLog->LogError("SPU id=%d has executed STOPD instruction\n",i);
			printf("SPU id=%d has executed STOPD instruction\n",i);
#if defined(USE_JOB_QUEUE_VERIFICATION)
			detectedFailure = true;
#endif
		}
		if(!(sys_raw_spu_mmio_read(m_SpuIDs[i], SPU_Status) & 0x1))
		{
			if(m_pLog)
				m_pLog->LogError("SPU id=%d has been halted\n",i);
			printf("SPU id=%d has been halted\n",i);
#if defined(USE_JOB_QUEUE_VERIFICATION)
			detectedFailure = true;
#endif
		}
		if(!(sys_raw_spu_mmio_read(m_SpuIDs[i], SPU_RunCntl) & 0x1))
		{
			if(m_pLog)
				m_pLog->LogError("SPU id=%d has run control 0\n",i);
			printf("SPU id=%d has run control 0\n",i);
#if defined(USE_JOB_QUEUE_VERIFICATION)
			detectedFailure = true;
#endif
		}
	}
#if defined(USE_JOB_QUEUE_VERIFICATION)
	if(detectedFailure)
	{
		//restart all SPUs, clear command queue
		printf("JobManager: resetting job queue and restarting all SPUs\n");
		Lock();
		SetDebuggingActive(false);
		//reset info blocks
		memset(m_SPUJobQueue.jobInfoBlocks, 0, sizeof(m_SPUJobQueue.jobInfoBlocks));
		//reset job queue
		m_SPUJobQueue.push.lockObtained				= 0;
		m_SPUJobQueue.push.curAddr						= m_SPUJobQueue.push.baseAddr;
		m_SPUJobQueue.pull.lockObtained				= 0;
		m_SPUJobQueue.pull.curAddr						= m_SPUJobQueue.pull.baseAddr;
		memset(m_SPUJobQueue.spuPacketSync, 0, SQueueNodeSPU::scSyncCount);
		m_CurSpuPacketSyncIndex = 127;//to be counted down to 1 then starting at 127 again..
		for(uint32 i=0; i<m_NumSPUAllowed; ++i)
		{
			if(SUCCEEDED != sys_raw_spu_destroy(m_SpuIDs[i]))
			{
				if(m_pLog)
					m_pLog->LogError("TestSPUs: failed to destroy SPU id: %d\n",m_SpuIDs[i]);
				printf("TestSPUs: failed to destroy SPU id: %d\n",m_SpuIDs[i]);
			}
			if(SUCCEEDED != sys_raw_spu_create((sys_raw_spu_t*)&m_SpuIDs[i], NULL))
			{
				if(m_pLog)
					m_pLog->LogError("TestSPUs: failed to recreate SPU id: %d\n",m_SpuIDs[i]);
				printf("TestSPUs: failed to recreate SPU id: %d\n",m_SpuIDs[i]);
			}
			LoadSPULoaderDriver(m_SpuIDs[i], i);
		}
		UnLock();
	}
#endif
}

const bool CJobManSPU::SPUJobsActive() const
{
	//compare pull and push pointer and if they are equal, check state of pull-1 info block jobState
	int32 jobSlot = (((int32)m_SPUJobQueue.push.curAddr - (int32)m_SPUJobQueue.push.baseAddr) >> NSPU::NDriver::scSizeOfSJobQueueEntryShift);
	const uint32 cJobSlotMask = (uint32)((int32)(-(jobSlot < 0)) >> 31);
	jobSlot = (int32)((scMaxWorkQueueJobs - 1) & ~cJobSlotMask | jobSlot & cJobSlotMask);
	return (m_SPUJobQueue.push.curAddr != m_SPUJobQueue.pull.curAddr)
		|| (m_SPUJobQueue.jobInfoBlocks[jobSlot].jobState.IsRunning());
}

void CJobManSPU::InitBucketMemory()
{
	using NSPU::SBucketHeader;
	using NSPU::SBucket;
	using NSPU::SBucketInfo;
	using NSPU::SBucketDir;

	//number of memory blocks per bucket size
	//to make header and list be part of the same cacheline, make sure the aggregated count is below 128
	//setup results in 249 KB plus 2.6 KB for the headers plus 2KB for release tracking per SPU 
	//memory is allocated in one block to apply the async write back functionality
	static const uint8 scBucketNumTable[NSPU::SBucketInfo::scBucketCount] = 
	{
		128 - SBucketHeader::scBucketHeaderSize,//32 byte buckets
		128 - SBucketHeader::scBucketHeaderSize,//64 byte buckets
		128 - SBucketHeader::scBucketHeaderSize,//128 byte buckets
		64	- SBucketHeader::scBucketHeaderSize,//256 byte buckets
		64	- SBucketHeader::scBucketHeaderSize,//512 byte buckets
		64	- SBucketHeader::scBucketHeaderSize,//1024 byte buckets
		32	- SBucketHeader::scBucketHeaderSize,//2048 byte buckets
		16	- SBucketHeader::scBucketHeaderSize	//4096 byte buckets
	};

	//first count the totally required memory per SPU and allocate in one chunk
	uint32 totalSize = 0;
	uint32 bucketSize = NSPU::SBucketInfo::scBucketSizeMin;
	for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
	{	
		const uint32 cBucketHeaderSize = SBucketHeader::scBucketHeaderSize + scBucketNumTable[j] + scBucketNumTable[j] * sizeof(uint32);
		totalSize += (cBucketHeaderSize + 127) & ~127;//must be 128 byte aligned
		bucketSize <<= 1;			//always power of 2
	}

	totalSize += sizeof(uint32)*SBucketInfo::scFreedMaxCount;//memory used for memory release tracking
	totalSize += 128;//alignment pad

	//for each SPU initialize and allocate all buckets
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		uint8 *pMem = (uint8*)(((uint32)(new uint8[totalSize]) + 127) & ~127);//align manually, not subject for any free
		SBucketInfo& rBucketInfo = ((SPPUMemRequestData*)m_MemAreaMan.GetSPUMemArea(i))->bucketInfo;
		NSPU::SBucket *pBuckets = &rBucketInfo.bucketHeaders[0];
		uint32 bucketSize = NSPU::SBucketInfo::scBucketSizeMin;

		rBucketInfo.freedCount = 0;
		uint8 *pCurMem = pMem;

		for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
		{	
			//allocate entire bucket header at once
			pMem = (uint8*)(((uint32)pCurMem + 127) & ~127);//each header must be aligned
			const uint8 cNumBlocks = scBucketNumTable[j];	
			const uint32 cBucketHeaderSize = 
				SBucketHeader::scBucketHeaderSize + //header
				cNumBlocks +						//single linked block list
				cNumBlocks * sizeof(uint32);//block directory
			SBucketHeader* pBuckHeader = (SBucketHeader*)pMem;

			//set up bucket size and pointer
			pBuckets[j].pBucketHeader = pBuckHeader;
			pBuckets[j].available			= cNumBlocks;
			pBuckets[j].size					= bucketSize;
			pBuckets[j].numTotal			= cNumBlocks;
			//set up bucket header
			pBuckHeader->listIndex		= 0;//make index 0 the first available bucket
			pBuckHeader->listIndexEnd = cNumBlocks-1;//last available index, required to append new available ones after reload
			pBuckHeader->dirIndex			= BUCKET_NULL;//no applied slots initially
			pBuckHeader->dirIndexEnd	= BUCKET_NULL;//no applied slots initially
			uint8 *pLinkedList = (uint8*)pBuckHeader + SBucketHeader::scBucketHeaderSize;
			//set up linked list, always pointing to next one
			for(uint32 b=0; b<cNumBlocks-1; ++b)
				pLinkedList[b] = (uint8)(b+1);
			pLinkedList[cNumBlocks-1] = BUCKET_NULL;
			assert(sizeof(SBucketDir) == sizeof(uint32));
			SBucketDir *pDir = (SBucketDir*)((uint8*)pBuckHeader + SBucketHeader::scBucketHeaderSize + cNumBlocks);
			//set up directory by allocating the individual buckets, always 16 byte aligned
			for(uint32 b=0; b<cNumBlocks; ++b)
				pDir[b].address	= (uint32)(::operator new(bucketSize, std::min((uint32)128, bucketSize)));//align to bucketSize (128 max)
			pCurMem = (uint8*)(&pDir[cNumBlocks]);//free info is right behind last bucket directory

			bucketSize <<= 1;//always power of 2
		}

		rBucketInfo.pFreedList = (uint32*)pCurMem;
		memset(rBucketInfo.pFreedList, 0, sizeof(uint32)*SBucketInfo::scFreedMaxCount);
	}
}

void CJobManSPU::UpdateSPUMemMan()
{
	if(IsDebuggingActive())
		return;
	using NSPU::SBucketHeader;
	using NSPU::SBucket;
	using NSPU::SBucketInfo;
	using NSPU::SBucketDir;
	//stall til all SPU jobs have been finished (important to have all caches flushed)
#if defined(_DEBUG)
	bool issueErrorMessage = true;
#endif
#if defined(USE_JOB_QUEUE_VERIFICATION)
	volatile uint32 loops = 0;
#endif
	while(SPUJobsActive())
	{
#if defined(_DEBUG)
		if(issueErrorMessage)
		{
			if(m_pLog)
				m_pLog->LogWarning("Warning: called UpdateSPUMemMan whilst having SPU jobs running\n");
			else
				printf("Warning: called UpdateSPUMemMan whilst having SPU jobs running\n");
			issueErrorMessage = false;
		}
#endif
#if defined(USE_JOB_QUEUE_VERIFICATION)
		if(++loops > MAX_ITER)
		{
			if(m_pLog)
				m_pLog->LogError("Timeout in UpdateSPUMemMan (waiting for all job to be finished)\n");
			else
				printf("\nTimeout in WaitSPUJob (waiting for all job to be finished)\n");
#if defined(SUPP_SN)
			printf("Stopping all SPUs\n");
			for(uint32 i=0; i<m_NumSPUAllowed; ++i)
			{
				WriteSPUProbReg(i, scPCSPURunCntl, 0);
				snRawSPUNotifySPUStopped(i);
			}
			while(SPUJobsActive()){}
			printf("Continuing all SPUs\n");
			for(uint32 i=0; i<m_NumSPUAllowed; ++i)
			{
				WriteSPUProbReg(i, scPCSPURunCntl, 1);
				snRawSPUNotifySPUStarted(i);
			}
#endif
			return;
		}
#endif
		for(uint32 i=0; i<16; ++i)
			asm volatile("nop");
	}
	//release memory specified by SPUs and refill buckets
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		SBucketInfo& rBucketInfo = ((SPPUMemRequestData*)m_MemAreaMan.GetSPUMemArea(i))->bucketInfo;
		const uint32 cFreeCount = rBucketInfo.freedCount;
		if(cFreeCount == 0)
			continue;//nothing to do here for this SPU
		//delete from SPU released memory
		//since we do not know the bucket it comes from (would require support from memory manager)
		//	we can just delete the buckets and refill the missing ones
		for(uint32 f=0; f<cFreeCount; ++f)
			::operator delete((void*)(rBucketInfo.pFreedList[f]));
		rBucketInfo.freedCount = 0;
		//refill buckets
		NSPU::SBucket *pBuckets = &rBucketInfo.bucketHeaders[0];
		for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
		{	
			SBucket& rBucket = pBuckets[j];
			if(rBucket.available == rBucket.numTotal)
				continue;//nothing to do for this bucket
			SBucketHeader rBuckHeader = *rBucket.pBucketHeader;
			SBucketDir *pDir = (SBucketDir*)((uint8*)pBuckets[j].pBucketHeader + 
				SBucketHeader::scBucketHeaderSize + rBucket.numTotal);
			uint8 *pLinkedList = (uint8*)rBucket.pBucketHeader + SBucketHeader::scBucketHeaderSize;
			rBucket.available = rBucket.numTotal;//reset count
			//go through available free list stored through directory entries and link new entries to end of linked list
			uint8 curIndex = rBuckHeader.dirIndex;//first index
			const uint8 cFirstRefilledIndex = curIndex;//save first index for post loop
			const uint8 cOldFirstIndex = rBuckHeader.listIndex;//save due to aliasing
			uint8 lastCurIndex = rBuckHeader.listIndexEnd;//if nothing is to refill, it remains the same
			//we have link the curIndex to the rBuckHeader.listIndexEnd
			//	if it was zero before, link to dummy for the first time
			uint8 *pLinkEntry = &pLinkedList[rBuckHeader.listIndexEnd];
			uint8 dummy;
			//	apply branch free: if(rBuckHeader.listIndexEnd == BUCKET_NULL) pLinkEntry = &dummy;
			const uint32 cLinkEntryMask = (uint32)(((int32)(rBuckHeader.listIndexEnd - BUCKET_NULL)) >> 31);
			pLinkEntry = (uint8*)(((uint32)&dummy) & ~cLinkEntryMask | ((uint32)pLinkEntry & cLinkEntryMask));
			while(curIndex != BUCKET_NULL)
			{
				SBucketDir& rCurDirEntry = pDir[curIndex];//get directory entry
				lastCurIndex = curIndex;//store last curIndex to apply to rBuckHeader.listIndexEnd in post loop
				curIndex = rCurDirEntry.GetLinkIndex();//get next index from directory free list
				//alloc new block
				rCurDirEntry.address = (uint32)(::operator new(rBucket.size, std::min((uint32)128, rBucket.size)));
				*pLinkEntry = curIndex;//link to current block
				pLinkEntry  = &pLinkedList[curIndex];//for next loop linkage
			}
			rBuckHeader.listIndexEnd = lastCurIndex;//mark current index as last index
			//if we had allocated all blocks, reset first index, apply branch free:
			//	if(rBuckHeader.listIndex == BUCKET_NULL) rBuckHeader.listIndex = cFirstRefilledIndex
			const uint32 cListIndexMask = (uint32)(((int32)(cOldFirstIndex - BUCKET_NULL)) >> 31);
			rBuckHeader.listIndex = cFirstRefilledIndex & ~cListIndexMask | cOldFirstIndex & cListIndexMask;

			//reset header
			rBuckHeader.dirIndex		= BUCKET_NULL;
			rBuckHeader.dirIndexEnd	= BUCKET_NULL;
		}
	}
}
	
void CJobManSPU::ShutDown()
{
	using NSPU::SBucketHeader;
	using NSPU::SBucket;
	using NSPU::SBucketInfo;
	using NSPU::SBucketDir;

	if(m_pLog)
		m_pLog->Log("Shutting down SPUs...");
	else
		printf("Shutting down SPUs...");
	UpdateSPUMemMan();

	//free all allocated memory
	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		SBucketInfo& rBucketInfo = ((SPPUMemRequestData*)m_MemAreaMan.GetSPUMemArea(i))->bucketInfo;
		NSPU::SBucket *pBuckets = &rBucketInfo.bucketHeaders[0];
		for(uint32 j=0; j<NSPU::SBucketInfo::scBucketCount; ++j)
		{	
			const SBucketHeader& crBuckHeader = *pBuckets[j].pBucketHeader;
			SBucketDir *pDir = (SBucketDir*)((uint8*)pBuckets[j].pBucketHeader + 
				SBucketHeader::scBucketHeaderSize + pBuckets[j].numTotal);

			for(uint32 b=0; b<pBuckets[j].numTotal; ++b)
				if(!pDir[b].IsUnused())
					::operator delete ((uint8*)(pDir[b].address)/*, std::min((uint32)128, pBuckets[j].size)*/);
		}
	}

	if(m_pSPURep)
		delete [] (uint8*)m_pSPURep;
	if(m_pBubbleDir)
		delete [] m_pBubbleDir;

	for(uint32 i=0; i<m_NumSPUAllowed; ++i)
	{
		if(SUCCEEDED != sys_interrupt_thread_disestablish(m_SpuIntHandle[i]))
		{
			if(m_pLog)
				m_pLog->LogError("Failed to disestablish interrupt thread for SPU id: %d\n",m_SpuIDs[i]);
			printf("Failed to disestablish interrupt thread for SPU id: %d\n",m_SpuIDs[i]);
		}
		if(SUCCEEDED != sys_interrupt_tag_destroy(m_SpuIntTags[i]))
		{
			if(m_pLog)
				m_pLog->LogError("Failed to destroy interrupt tag for SPU id: %d\n",m_SpuIDs[i]);
			printf("Failed to destroy interrupt tag for SPU id: %d\n",m_SpuIDs[i]);
		}
		if(SUCCEEDED != sys_raw_spu_destroy(m_SpuIDs[i]))
		{
			if(m_pLog)
				m_pLog->LogError("Failed to destroy SPU id: %d\n",m_SpuIDs[i]);
			printf("Failed to destroy SPU id: %d\n",m_SpuIDs[i]);
		}
#if defined(SUPP_SN)
		snRawSPUNotifyDestruction(m_SpuIDs[i]);
#endif
	}
	printf("done\n");
};

//fetches the SPU frame stats from the memory area manager into rStats and resets them afterwards
//compiles to an empty function without SUPP_SPU_FRAME_STATS
void CJobManSPU::GetAndResetSPUFrameStats(NPPU::SSPUFrameStats& rStats)
{
#if !defined(SUPP_SPU_FRAME_STATS)
	(void)rStats;//frame stats are not supported in this build
#else
	m_MemAreaMan.GetSPUFrameStats(rStats);
	m_MemAreaMan.ResetStats(m_SPUJobQueue.pull.lockObtained-1);
#endif
}

//fetches and resets the SPU frame stats and additionally hands out the per job profile data
//	registered during the current frame: rCurFrameProfVec receives one copy per registered entry,
//	each entry gets reset and the registration count restarts at 0
//compiles to an empty function without SUPP_SPU_FRAME_STATS
void CJobManSPU::GetAndResetSPUFrameStats(SSPUFrameStats& rStats, std::vector<SFrameProfileData>& rCurFrameProfVec)
{
#if defined(SUPP_SPU_FRAME_STATS)
	m_MemAreaMan.GetSPUFrameStats(rStats);
	m_MemAreaMan.ResetStats(m_SPUJobQueue.pull.lockObtained-1);

	const uint32 cEntryCount = m_CurFrameProfDataIndex;
	rCurFrameProfVec.resize(cEntryCount);
	//copy each entry first, reset it right after
	SFrameProfileData** const ppEntries = m_SPUJobQueue.frameProfData;
	for(uint32 idx=0; idx != cEntryCount; ++idx)
	{
		SFrameProfileData& rEntry = *ppEntries[idx];
		rCurFrameProfVec[idx] = rEntry;
		rEntry.Reset();
	}
	m_CurFrameProfDataIndex = 0;
#else
	(void)rStats;
	(void)rCurFrameProfVec;
#endif//SUPP_SPU_FRAME_STATS
}

//stores the requested SPU driver debugging state, has no effect without SUPP_SN
void CJobManSPU::EnableSPUDriverDebugging(const bool cEnable)
{
#if !defined(SUPP_SN)
	(void)cEnable;//nothing to store without SN support
#else
	m_SPUDriverDebuggingEnabled = cEnable;
#endif
}

//remembers the job to be debugged: stores the jobHandle member of the passed job handle,
//	passing NULL clears the stored value
//has no effect without SUPP_SN
void CJobManSPU::EnableSPUJobDebugging(void* cJobHandle)
{
#if defined(SUPP_SN)
	if(cJobHandle)
		m_SPUJobDebugHandle = (void*)((NPPU::TJobHandle)cJobHandle)->jobHandle;
	else
		m_SPUJobDebugHandle = NULL;
#else
	(void)cJobHandle;
#endif
}

#undef MAX_ITER

#endif //PS3
