/* 
	definitions for spu job manager
	singleton implementation

	- it manages the job queue hierarchy, one node queue per individual SPU
	- to each SPU the initial loader is uploaded and the spu driver with the PPU push and pull addresses
	- each job has an ID
	- each job has an automatically generated Execute - function taking the parameter address as input
			this function is automatically generated and takes care of calling the actual job entry function
	- memory areas must be specified to be mapped to SPU
		- they can be issued in any order; on the first run they get sorted and saved into a template that is reused for faster processing on subsequent runs
		- allocations are tracked as well, so a delete and (re)alloc can be performed on SPU and will get mirrored on PPU too
		- static class variables can be handled too
		- null can be specified as size too; this means the allocation has to be performed on PPU first, and the SPU has to wait until it is issued
*/

#ifndef __JOBMAN_SPU_H
#define __JOBMAN_SPU_H
#pragma once

#if defined(PS3)

#include <platform.h>
#include <MultiThread.h>
#include <sys/raw_spu.h>
#include <IJobManSPU.h>
#include "../SPU/CodePage/SPUPages.h"
#include "SPU/SPULoaderDump.h"
#include "SPU/SPULoaderDefs.h"
#include "SPUMemAreaMan.h"
#include "PPU.h"
#ifndef JOB_LIB_COMP
	#include "SPUJobBase.h"
#endif
#include "../SPU/Cache/CacheDefs_spu.h"
#include "../SPU/LibSN_Module.h"
#include <cell/spurs.h>
#include <cell/spurs/event_flag.h>
#include <cell/gcm.h>
#include <lib/libsn.h>
#include <map>

//forward declaration of the SPU ELF info type (only the name is needed in this header)
namespace NSPU { namespace NElf { struct SElfInfo; } }

//forward declarations of the page binary types that are only used through pointers in this header
namespace NPageBin
{
	struct SJobStringHandle;
	struct SHeader;
}

namespace NPPU
{
	//vector of pointers to job string handles (used by the job manager to store one job string pointer per job id)
	typedef std::vector<NPageBin::SJobStringHandle*> TJobStringVec;

	//printf-style variadic output function, only declared here
	//	fmt - format string followed by the matching variadic arguments
	//	NOTE(review): meaning of the int return value is not visible in this header - confirm against the definition
	int PrintOut( const char *fmt, ... );

	//
	// singleton managing the job queues and/for the SPUs
	class CJobManSPU : public IJobManSPU
	{
	public:
		// singleton access: returns a pointer to the CJobManSPU instance
		// (the constructors of this class are private, so this is the way to obtain the object)
		static CJobManSPU *Instance();

		//virtual destructor with an empty body, nothing is released here
		virtual ~CJobManSPU()
		{}

		//number of SPUs the job scheduling is allowed to use (value of m_NumSPUAllowed)
		virtual const unsigned int GetSPUsAllowed() const
		{
			return m_NumSPUAllowed;
		}

		//sets number of SPUs allowed for job scheduling (must be called before spu initialization)
		//	cNum - requested number of SPUs, valid range is 1..scMaxSPU
		//the call is ignored once the SPUs have been initialized
		//an out of range value asserts in debug builds and is rejected (previous value kept) in builds
		//where assert is compiled out, since the count is used as bound for scMaxSPU sized arrays
		virtual void SetSPUsAllowed(const unsigned int cNum)
		{
			if(m_Initialized)
				return;
			const bool cInRange = (cNum > 0 && cNum <= scMaxSPU);
			assert(cInRange);
			if(cInRange)
				m_NumSPUAllowed = cNum;
		}

		//size of the spu driver (value of m_DriverSize), job data must be placed behind the driver
		virtual const unsigned int GetDriverSize() const
		{
			return m_DriverSize;
		}

		//initializes all allowed SPUs
		//	FreeFunc				- function used to free memory, defaults to free
		//	MallocFunc			- function used to allocate memory, defaults to memalign
		//	cSPUThreadCnt		- defaults to 0; NOTE(review): presumably the number of SPU threads to set up - confirm in the implementation
		//	cSPURSCnt				- defaults to 1; NOTE(review): presumably the number of SPUs handed to SPURS - confirm in the implementation
		//	bEnablePrintf		- defaults to true; presumably the value later reported by IsPrintfEnabled - confirm
		//returns a bool; NOTE(review): presumably true on success - confirm in the implementation
		virtual const bool InitSPUs
		(
			TSPUFreeFunc FreeFunc = &free, 
			TSPUMallocFunc MallocFunc = &memalign,
			const int cSPUThreadCnt = 0, 
			const int cSPURSCnt = 1,
			bool bEnablePrintf = true
		);

		//polls for a spu job (do not use if a callback has been registered)
		//	rJobState		- external job state to poll
		//	cTimeOutMS	- timeout in milliseconds, default -1 (presumably "no timeout" - TODO confirm in the implementation)
		virtual const bool WaitSPUJob(volatile NSPU::NDriver::SExtJobState& rJobState, const int cTimeOutMS=-1) const;

		//print performance stats
		//	pPerfStats	- stats to print
		//	cpJobName		- name of the job the stats belong to
		virtual void PrintPerfStats(const volatile NSPU::NDriver::SJobPerfStats* pPerfStats, const char* cpJobName) const;

		//sets the external log
		virtual void SetLog(ILog *pLog);

		//returns true if SPU jobs are still active
		virtual const bool SPUJobsActive() const;

		//shuts the job manager down (only declared here, see the implementation for what is released)
		virtual void ShutDown();

		//tests all acquired SPUs if they are running, restarts if any failure
		virtual void TestSPUs();

		//tests all acquired SPUs if they are running, reports -1 if something went wrong
		//	cIgnoreDebugState - default false; NOTE(review): exact effect not visible in this header
		virtual int VerifySPUs(const bool cIgnoreDebugState = false) const;

		//clean released memory from SPUs and refill buckets
		virtual void UpdateSPUMemMan();

		//enables spu driver debugging
		virtual void EnableSPUDriverDebugging(const bool cEnable);

		//enables spu debugging for a particular job
		//	cJobHandle - handle of the job to debug
		virtual void EnableSPUJobDebugging(void* cJobHandle);

		//registers the variable that is checked to decide if profiling data should be transferred back
		//without DO_SPU_PROFILING the pointer is not stored and the call has no effect
		virtual void RegisterProfileStatVar(int* pVar)
		{
#if !defined(DO_SPU_PROFILING)
			(void)pVar;
#else
			m_pProfStatControl = pVar;
#endif
		}

		//retrieves the name of a job
		//	cId - id of the job
		const char* GetJobName(const uint32 cId);

		//obtains and resets the SPU stats of the last frame
		//	rStats - receives the frame stats
		//	cReset - default true; presumably skips the reset when false - TODO confirm in the implementation
		virtual void GetAndResetSPUFrameStats(SSPUFrameStats& rStats, const bool cReset=true);
		//overload additionally handing out a pointer to per job frame profile data and its element count
		virtual void GetAndResetSPUFrameStats(SSPUFrameStats& rStats, const SFrameProfileData*& rpCurFrameProfVec, uint32& rCount);

		//hands out a pointer to the function profiling stats and their count
		//	cThresholdUSecs - default 100; NOTE(review): presumably entries below this time (in microseconds) are dropped - confirm
		virtual void GetAndResetSPUFuncProfStats(const SFrameProfileData*& rpCurFuncProfStatVec, uint32& rCount, const uint32 cThresholdUSecs = 100);

		//adds a job
		//	crJob						- job to add
		//	cOpMode					- operation mode of the job
		//	cMinStackSizeKB	- minimum stack size in KB (unit taken from the parameter name)
		//	cJobHandle			- handle of the job
		//returns an EAddJobRes result code
		virtual const EAddJobRes AddJob
		(
			CSPUJobDel& __restrict crJob,
			const uint32 cOpMode,
			const uint8 cMinStackSizeKB,
			const TJobHandle cJobHandle
		);

		//obtain job handle from name
		//	cpJobName	- name of the job
		//	cStrLen		- length of cpJobName
		virtual const TJobHandle GetJobHandle(const char* cpJobName, const uint32 cStrLen) const;
		//convenience overload for zero terminated names, measures the name with strlen and forwards
		//cpJobName must not be null since it is passed to strlen
		virtual const TJobHandle GetJobHandle(const char* cpJobName) const
		{
			const uint32 cNameLen = strlen(cpJobName);
			return GetJobHandle(cpJobName, cNameLen);
		}

		//hands out the address of the spurs memory owned by the job manager (m_SPURS)
		virtual CellSpurs* GetSPURS()
		{
			CellSpurs* const pSpurs = &m_SPURS;
			return pSpurs;
		}

		//allocates memory through the registered allocation function (m_MallocFunc)
		//	size			- passed as first argument to m_MallocFunc
		//	alignment	- passed as second argument to m_MallocFunc, default 8
		//NOTE(review): the standard memalign takes (alignment, size); confirm the argument order
		//TSPUMallocFunc expects matches the (size, alignment) order used here
		virtual void* Allocate(uint32 size, uint32 alignment = 8)
		{
			void* const pMem = m_MallocFunc(size, alignment);
			return pMem;
		}

		//releases memory through the registered free function (m_FreeFunc)
		//	p - pointer handed to m_FreeFunc
		virtual void Free(void* p) { m_FreeFunc(p); }

		//returns the currently stored log pointer (m_pLog)
		ILog* GetLog() const { return m_pLog; }

		//returns true if debugging is active (only declared here)
		virtual const bool IsDebuggingActive() const;

		//returns the value of the allocation counter m_AllocatedMemory
		virtual const uint32 GetAllocatedMemory() const { return m_AllocatedMemory; }

		virtual void StopSPUs()
		{
			for(uint32 i=0; i<m_NumSPUAllowed; ++i)
			{
				WriteSPUProbReg(i, scPCSPURunCntl, 0);
				snRawSPUNotifySPUStopped(i);
			}
		}

		virtual void ContinueSPUs()
		{
			for(uint32 i=0; i<m_NumSPUAllowed; ++i)
			{
				WriteSPUProbReg(i, scPCSPURunCntl, 1);
				snRawSPUNotifySPUStarted(i);
			}
		}

		virtual void GcmInit
		(
			const uint32 cCountDeviceThreadEA,
			void *const __restrict pReportLoc,
			const uint32 cRSXBase, 
			CellGcmContextData *const __restrict pCellGcmCurrentContext,
			const uint32 cCurGCMCmdOffset,
			const uint32 cGCMCmdResetOffset,
			const uint32 cInjectBufOff
		)
		{
			//get offset of main command buffer
			m_GcmAddressBase											= cCurGCMCmdOffset;
			m_PageInfo.gcmCmdAddressBase					= cCurGCMCmdOffset;
			m_CellGcmCurrentContext								= (uint32)pCellGcmCurrentContext;
			m_PageInfo.gcmGlobalPPUContext				= (uint32)pCellGcmCurrentContext;
			CellGcmControl *const pGCMControlReg	= cellGcmGetControlRegister();
			m_PageInfo.gcmGlobalPPUControlReg			= offsetof(CellGcmControl, put) + (uint32)(void*)pGCMControlReg;
			m_PageInfo.gcmRsxBaseAddress					= cRSXBase;
			m_PageInfo.reportEA										= (unsigned int)pReportLoc;
			m_PageInfo.gcmCountDeviceThreadEA			=	cCountDeviceThreadEA;
			m_PageInfo.profilingEnabled						= 0;
			m_PageInfo.funcProfTimingEA						= (uint32)m_pFuncProfilingArea;
			m_PageInfo.gcmInjectBufOff						= cInjectBufOff;
			m_PageInfo.gcmCmdResetOffset					= cGCMCmdResetOffset;
		}

		//sets the spu profiling mode in the page info (0 disabled, 1 job mode, 2 func mode), default 0
		virtual void EnableSPUProfiling(const uint8 cEnable=0) { m_PageInfo.profilingEnabled = cEnable; }

		//returns true if the SPU with the given index is processing (only declared here)
		//	cSPUIIndex - index of the SPU to query
		virtual bool IsSPUProcessing(const unsigned int cSPUIIndex) const;
		
		//returns the stored printf enable flag (m_bEnablePrintf)
		bool IsPrintfEnabled() const
		{
			return m_bEnablePrintf;
		}
	private:
		//data members: declaration order and the _ALIGN attributes matter, several members are accessed from SPU
		//do not reorder or change alignment without checking the SPU side

		//this symbol also acts as the relative base of global/static variables, mangled name _ZN4NPPU10CJobManSPU17scInitalSPULoaderE
		static uint32 scInitalSPULoader[NSPU::scLoaderTextSizeBytes >> 2] _ALIGN(128);	//initial SPU loader, array of 32 bit words (size in bytes / 4)
		static NSPU::SLoaderParams scSPULoaderParam;	//loader parameters, cannot be on the stack because it is DMA'd from
		
		CellSpurs m_SPURS _ALIGN(128);//spurs memory

		SQueueNodeSPU m_SPUJobQueue _ALIGN(128);	//SPU job queue node where jobs are pushed into and from

		//following 16 bytes are written by SPU, do not change order or alignment, the full cache line is used by atomics
		volatile int m_SpinLock _ALIGN(128);			//spin lock memory, we just need non reentrant locking
		//must reside here, is accessed by SPU
		uint8	 m_CurSpuPacketSyncIndex;						//current used spu packet sync index, 1..127 (byte 0 used for locking)
		uint8	 pad[3];														//padding bytes after the sync index
		uint32 m_SpuIDs[scMaxSPU];								//raw SPU IDs, one for each logical SPU (8 bytes are written by SPU as well)
		//accessed by job spawn api, offset must stay 128 above spin lock
#if defined(SUPP_SPU_FRAME_STATS)
		uint32 m_CurFrameBufIndex _ALIGN(128);		//current index of frame triple profiling buffer (0..(JOB_FRAME_STATS_BUFS-1))
		SFrameProfileData *m_pFrameProfileData _ALIGN(16);		//array of frameprofiledata, JOB_FRAME_STATS_BUFS for each job (to cope with cross frame stats)
		std::vector<SFrameProfileData> m_CurFrameProfVec[JOB_FRAME_STATS_BUFS];//current stats, pointer is exposed
#endif
		uint32 m_JobNum  _ALIGN(128);							//total number of distinct spu jobs loaded from repository
		sys_interrupt_tag_t m_SpuIntTags[scMaxSPU] _ALIGN(128); //raw SPU interrupt tag
		sys_interrupt_thread_handle_t m_SpuIntHandle[scMaxSPU]; //raw SPU interrupt handle
		uint32 m_RealCurPushAddress;							//current push address to be used by PPU, SPU might see temporarily an older one (if so, it is != ~0)
		NSPU::NDriver::SInfoBlock *m_pLastAddedInfoBlock;	//pointer to last recently added info block
//#if defined(SUPP_SN)
		spu_mod_hdr* m_pElfInfo;									//elf info of driver
//#else
//		NSPU::NElf::SElfInfo*	m_pElfInfo;				//elf info of driver
//#endif
		//align here again to 128 bytes to safely and fast use the locking above
		CSPUMemAreaMan m_MemAreaMan _ALIGN(128);	//memory allocation/deallocation request manager for SPU
		NSPU::SPageDirInfo m_PageInfo _ALIGN(16);	//page info for the SPU drivers
		NPageBin::SHeader *m_pSPURep;							//pointer to SPU repository, mem is 128 byte aligned
		NSPU::SPageInfo *m_pPageDir;							//page dir for SPU, each page gets the EA
		uint32* m_pFuncProfilingArea;							//timing destination n for each job from SPU, cannot overlap in any cache line, 128 byte aligned, dyn.allocated
#ifdef DO_SPU_FUNCPROFILING
		uint32* m_pFuncProfilingAreaLastFrame;		//last frames spu timing values, no count here
		char *	m_pFuncProfStringTable;						//string table for function profile secs
		uint16* m_pFuncStringIndices;							//indices into string table 
		uint32 m_FuncProfVecClearCnt;							//cached number of 16 bytes to clear on function profile retrieval
		std::vector<SFrameProfileData> m_FuncProfStats;//area to store last frames function profiling stats
#endif
		std::vector<SJobStringHandle> m_JobStringTable;	//sorted job string table to retrieve a handle from a string
		std::map<uint32,uint32>	m_JobNameMap;			//to name job handles to their respective job name
		TJobStringVec m_JobStringOffsets;					//pointer to job strings for each job id
		ILog *m_pLog;															//pointer to log file
		uint32 m_NumSPUAllowed;										//number of SPUs allowed to be used for job scheduling, 1..scMaxSPU
		uint32 m_DriverSize;											//driver size, SPU jobs must place data after it
#if defined(SUPP_SN)
		void*	 m_SPUJobDebugHandle;								//current job handle for debugging is to be enabled, 0 if non default
		bool	 m_SPUDriverDebuggingEnabled;				//true if spu driver debugging is to be enabled
		mutable bool	 m_DebuggingActive;					//true if debugging is currently active
		bool	 m_FrameDebuggingActive;						//true if debugging was anytime active this frame
#endif
		bool	 m_Initialized;											//true if SPUs have been initialized
		int		 m_SpursInitialized;								//>0 if SPURS has been initialized
#if defined(DO_SPU_PROFILING)
		int*	 m_pProfStatControl;								//variable to check if the profiling data should be transferred back
#endif
		uint32 m_AllocatedMemory;									//current allocated memory
		uint32 m_GcmAddressBase;									//base address for cellGcmAddressToOffset, passed to SPUs
		uint32 m_CellGcmCurrentContext;						//address of global gcm-context
		TSPUFreeFunc m_FreeFunc;									//function to free SPU freed memory with
		TSPUMallocFunc m_MallocFunc;							//function to alloc SPU memory with

		uint8* m_pBucketDirMem[NPPU::scMaxSPU];		//allocated bucket memory for each SPU

		bool m_bEnablePrintf;											//remember if printf should be allowed
		// singleton stuff: all constructors are private so the class cannot be created or copied from outside
		//	pDriver - NOTE(review): presumably the spu driver image - confirm against the definition
		CJobManSPU(void* pDriver);
		//copy constructor and assignment are declared private to forbid copying
		//NOTE(review): presumably intentionally left without definition - confirm
		CJobManSPU(const CJobManSPU&);
		CJobManSPU& operator= (const CJobManSPU&);

		//loaded on an interrupt PPU thread, handles SPUs printf request
		//	spuId - id of the SPU which raised the interrupt
		static void HandleSpuInterrupt(uint64_t spuId);//use uint64_t since otherwise we earn a warning

		//registers an interrupt handler for SPU/PPU synchronization and SPU printf's for a SPU with id spuId
		//	cSPUId - raw SPU id
		//	cIndex - NOTE(review): presumably the logical SPU index (slot into the per SPU arrays) - confirm
		//returns true if it has successfully been registered
		const bool CreateRawSpuIntrHandler(const sys_raw_spu_t cSPUId, const uint32 cIndex);

		//loads the SPU loader and the SPU driver into SPU local store
		//	cRealSPUId			:  ID for the SPU
		//	cSPUIndex				:  NOTE(review): presumably the logical SPU index - confirm
		//	cIsRecreate			:  default false; NOTE(review): presumably true when an SPU gets restarted - confirm
		void LoadSPULoaderDriver
		(
			const unsigned int cRealSPUId, 
			const unsigned int cSPUIndex,
			const bool cIsRecreate = false
		) const;

		//gets job slot for next job (to get storage index for SJobdata), waits until a job slot becomes available again since data get overwritten
		//	rJobSlot	- receives the job slot
		//	rNextPush	- receives the next push value
		const EAddJobRes GetSPUJobSlot(uint32& __restrict rJobSlot, uint32& __restrict rNextPush);

		//creates the DMA list for a job
		//	cJobParamSize	- size of the job parameters
		//	pParamAddr		- destination address for the parameters
		//	crJob					- job to take the parameters from
		void CreateDMAListSingle(const uint32 cJobParamSize, uint8* const __restrict pParamAddr,	const CSPUJobDel& crJob);

		//retrieves the amount of space available in SPU:
		//256 KB minus the reserved stack area minus the driver size (m_DriverSize)
		__attribute__((always_inline))
		const unsigned int GetAvailableSPUSize() const
		{
			//reserve 16 KB plus the initial stack address
			static const unsigned int scReservedStackSize = 16 * 1024 + 752/*initial stack address*/;
			const unsigned int cSpaceBehindStack = 256 * 1024 - scReservedStackSize;
			return cSpaceBehindStack - m_DriverSize;
		}

		//hands out a reference to the spu memory area manager (m_MemAreaMan)
		__attribute__((always_inline))
		CSPUMemAreaMan& GetMemAreaMan() { return m_MemAreaMan; }

		//hands out a reference to the SPU job queue node (m_SPUJobQueue), for callback access
		__attribute__((always_inline))
		SQueueNodeSPU& GetJobQueue() { return m_SPUJobQueue; }

		//sends a MFC DMA command to a SPU (only declared here)
		//	cSPUId - id of spu
		//	cLS - destination local store address
		//	cEA - effective address for DMA transfer
		//	cSize - size of data to transfer
		//	cDMATag - DMA tag of the transfer
		//	cCommand - mfc command to be executed
		void SendMFCCmd
		(
			const unsigned int cSPUId, 
			const unsigned int cLS, 
			const uint32 cEA, 
			const unsigned int cSize, 
			const unsigned int cDMATag, 
			const unsigned int cCommand
		) const;

		//syncs a dma command
		//	cSPUId - id of spu
		//	cDMATag - DMA tag of the transfer
/*		void SyncMFCCmd
		(
			const unsigned int cSPUId, 
			const unsigned int cDMATag
		) const;
*/
		//initializes the bucket headers for each SPU
		void InitBucketMemory();

		//creates the job handle <-> string table
		void CreatePageJobStringTable();

		//creates the page directory for SPU, each page index gets the PPU EA
		void CreateSPUPageDir();

		//executes PPU call initiated from SPU
		//	cArg - argument of the call
		//	second (unnamed) parameter - NOTE(review): meaning not visible in this header, confirm against the definition
		void ExecPPUCall(const uint32 cArg, const unsigned int);

		//cleans up allocated/freed memory from SPU, cIgnoreProcessing true ignores the running state of SPU
		//  this allows the cleanup to be called from SPU itself (which is then in the running state)
		//	cSPUIndex					- index of the SPU to clean up for
		//	cIgnoreProcessing	- default false
		void UpdateSPUMemManSingleSPU(const uint32 cSPUIndex, const bool cIgnoreProcessing = false);

		//initializes the job manager spin lock by storing 0 into m_SpinLock
		//(0 is also the value UnLock stores, so 0 is the unlocked state)
		ILINE void InitLock()
		{
			m_SpinLock = 0;
		}

		//locks the job manager: calls CrySpinLock on m_SpinLock with the arguments (0, 1)
		//NOTE(review): presumably spins until the value can be switched from 0 (free) to 1 (taken) - confirm against CrySpinLock
		ILINE void Lock()
		{
			CrySpinLock(&m_SpinLock, 0, 1);
		}

		//unlocks the job manager by storing 0 into m_SpinLock
		//NOTE(review): plain volatile store, no explicit memory barrier is visible here - confirm that
		//release ordering of the protected writes is guaranteed on the target
		ILINE void UnLock()
		{
			m_SpinLock = 0;//only 1 thread should be allowed to unlock it
		}

#if defined(SUPP_SN)
		//records the current debugging state in m_DebuggingActive
		//activating additionally latches m_FrameDebuggingActive to true, deactivating never clears the latch here
		ILINE void SetDebuggingActive(const bool cActive)
		{
			if(cActive)
			{
				m_FrameDebuggingActive = true;
			}
			m_DebuggingActive = cActive;
		}
#endif

#if defined(SUPP_SPU_FRAME_STATS)
		//returns the frame profile entry of a job for one of the frame stat buffers
		//the array holds JOB_FRAME_STATS_BUFS consecutive entries per job
		//	cIndex		- buffer index within the entries of the job
		//	cJobIndex	- index of the job
		//no range checking is performed
		ILINE SFrameProfileData* GetFrameProfData(const int cIndex, const uint32 cJobIndex)
		{
			return m_pFrameProfileData + (cJobIndex * JOB_FRAME_STATS_BUFS + cIndex);
		}
#endif

		//locks the function profiling area, the first word of m_pFuncProfilingArea is used as spin lock
		//calls CrySpinLock on it with the arguments (0, 1); does nothing without DO_SPU_FUNCPROFILING
		ILINE void LockFuncProfilingArea()
		{
#ifdef DO_SPU_FUNCPROFILING
			CrySpinLock((volatile int*)m_pFuncProfilingArea, 0, 1);
#endif
		}

		//unlocks the function profiling area by storing 0 into the first word of m_pFuncProfilingArea
		//does nothing without DO_SPU_FUNCPROFILING
		//NOTE(review): the store goes through a non volatile pointer and no barrier is visible here - confirm this is sufficient
		ILINE void UnlockFuncProfilingArea()
		{
#ifdef DO_SPU_FUNCPROFILING
			*(uint32*)m_pFuncProfilingArea = 0;//release spin lock
#endif
		}

		//zeroes the function profiling area except for its first 128 bytes (which hold the lock)
		//does nothing without DO_SPU_FUNCPROFILING
		ILINE void ResetFuncProfilingArea()
		{
#ifdef DO_SPU_FUNCPROFILING
			//first clear all but first cache line (atomic lock)
			vec_uint4 *__restrict pFuncProfilingArea16 = (vec_uint4*)((uint8*)m_pFuncProfilingArea + 128);
			//one iteration less than m_FuncProfVecClearCnt since the first 128 bytes are skipped, guarded against underflow for 0
			const uint32 cVecClearCnt = (m_FuncProfVecClearCnt>0) ? m_FuncProfVecClearCnt-1 : 0;
			for(uint32 i=0;i<cVecClearCnt;++i)
			{
				//manually unrolled: 8 vector stores per iteration
				//NOTE(review): assuming vec_uint4 is 16 bytes this clears 128 bytes per iteration - confirm the unit of m_FuncProfVecClearCnt matches
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
				*pFuncProfilingArea16++ = (vec_uint4){0,0,0,0}; 
			}
#endif
		}

		//retrieves the current job of a SPU as string (only declared here)
		//	cSPUId - id of the SPU
		//	NOTE(review): presumably returns the job name - confirm against the definition
		const char* RetrieveCurrentSPUJob(const uint32 cSPUId)const;

		//grants a namespace scope function HandleSpuInterrupt access to the private members
		//NOTE(review): this names a free function, not the static member function of the same name
		//(which has access anyway) - confirm that such a free function exists, otherwise this is redundant
		friend void HandleSpuInterrupt(uint64_t spuId);
	};
}//NPPU

//returns true if the SPU with the given index is in none of the wait, poll or thread states
//the state word is read from SPU local store at offset 28
//without SUPP_SN the state is not available and false is always returned
__attribute__((always_inline))
inline bool IsSPUNonThreadedProcessing(const unsigned int cSPUIIndex)
{
#if !defined(SUPP_SN)
	return false;
#else
	//spu_mod_hdr->pad holds the current processing state(first 32 bytes in non PIC SPUDriver image)
	const uint32 cDriverState = NPPU::ReadSPULS(cSPUIIndex, 28);
	const bool cInKnownState =
		(SPUWaitState == cDriverState) ||
		(SPUPollState == cDriverState) ||
		(SPUThreadState == cDriverState);
	return !cInKnownState;
#endif
}

__attribute__((always_inline))
inline void NPPU::CJobManSPU::CreateDMAListSingle
(
	const uint32 cJobParamSize,
	uint8* const __restrict pParamAddr,
	const CSPUJobDel& crJob
)
{
	assert((cJobParamSize & 0xF) == 0);
	assert(cJobParamSize < NSPU::NDriver::SInfoBlock::scAvailParamSize - 4/*space for no packet ptr for simplicity*/);
	uint8 *__restrict pCurParamDataDest = (uint8*)pParamAddr;
	const CCommonDMABase* __restrict pCurPacketData = (const CCommonDMABase*)&crJob;
	const void* const __restrict cpPacketSrc = pCurPacketData->GetJobParamData();
	memcpy((void* __restrict)pCurParamDataDest, cpPacketSrc, cJobParamSize);
	pCurParamDataDest += cJobParamSize;
}

#endif //PS3
#endif //__JOBMAN_SPU_H
