/* 
	declaration of job structures
*/

#ifndef __SPU_JOBSTRUCTS_H
#define __SPU_JOBSTRUCTS_H
#pragma once

#if defined(PS3)

#include "DMAElement.h"

#define USE_JOB_QUEUE_VERIFICATION

//forward declarations for friend usage
namespace NPPU
{
#if defined(SUPP_SPU_FRAME_STATS)
	//only used by pointer in this header (SQueueNodeSPU::frameProfData), so a forward declaration is sufficient
	struct SFrameProfileData;
	//max number of jobs profiled per frame (element count of SQueueNodeSPU::frameProfData)
	#define MAX_PROFILE_JOBS 256
#endif

#if !defined(_SPU_JOB)
	//declared as friend by SJobState and SExtJobState
	class CJobManSPU;
#endif
	//maximum number of jobs allowed in the spu queue, keep it as low as possible to reduce cache misses
	//it is the element count of SQueueNodeSPU::jobInfoBlocks and SQueueNodeSPU::jobData, which are both
	//128 byte aligned: adjust the count to reduce storage waste and cache misses
	//should be as small as possible to reuse the cache efficiently, do not raise above 128 (dma index going out of range)
	static const unsigned int scMaxWorkQueueJobs = 32;
	//callback information: a function pointer together with the argument handed to it
	struct SCallback
	{
		void (*pCallbackFnct)(void*);	//function to invoke, null if there is no callback
		void* __restrict pArg;										//value passed to pCallbackFnct

		//creates an empty callback (no function, no argument)
		__attribute__((always_inline))
		inline SCallback()
		{
			pArg					= 0;
			pCallbackFnct	= 0;
		}
	};

	//struct for holding the job data kept beyond the info block: the callback data plus, with
	//USE_JOB_QUEUE_VERIFICATION, the job start time (must be 16 byte aligned for DMA transfer)
	//corresponds directly to a job info block entry (kept in separate arrays for the sake of memory saving)
	//always completely copied during SPU job spawning
	//NOTE(review): the original description also lists a job state, but no such member is declared here
	struct SJobData
	{
		NPPU::SCallback callbackData;							//callback data, 8 bytes
#if defined(USE_JOB_QUEUE_VERIFICATION)
		unsigned long long jobStartTime;					//job start time in microseconds, used for timeouts
#else
		unsigned long long pad;										//keep at multiple of 16 bytes (takes the place of jobStartTime)
#endif
	} _ALIGN(16);
}

namespace NSPU
{
	namespace NDriver
	{
#if !defined(_SPU_JOB)
		struct SInfoBlock;
#endif
		//pollable completion flag of a job, meant to be used like a condition variable
		//the SPU sets it by DMA, therefore it has to sit on a 16 byte boundary (alignment ensured by padding in SJobData)
		struct SJobState
		{
#if !defined(__SPU__)
		private:
#endif
			volatile unsigned int running;	//non zero (1) while running, 0 otherwise
#if !defined(_SPU_JOB)
			friend class NPPU::CJobManSPU;
			friend struct SInfoBlock;
#endif
		public:
			//a fresh state reports "not running"
			__attribute__((always_inline))
			inline SJobState() : running(0)	{}

			//reads the flag once and reports whether it is set
			__attribute__((always_inline))
			inline volatile const bool IsRunning() const volatile
			{
				const unsigned int cCurState = running;
				return cCurState != 0;
			}
		};

		//SJobState variant which is aligned at 16 bytes and filled up with three pad words
		struct SExtJobState : public SJobState
		{
		private:
			volatile unsigned int pad[3];	//filler behind the inherited running flag, never initialized
#if !defined(_SPU_JOB)
			friend class NPPU::CJobManSPU;
			friend struct SInfoBlock;
#endif
		public:
			//the base class constructor clears the running flag, the pad is left untouched
			__attribute__((always_inline))
			inline SExtJobState()	{}
		} _ALIGN(16);
	}
}

namespace NSPU
{
	namespace NDriver
	{
		//performance stats for job
		//cacheHits must be first element for SPU speed processing
		//do not reorder, accessed in asm code
		//the members add up to 96 bytes (assuming 4 byte ints and 2 byte shorts), which is exactly
		//the amount Reset() clears: keep both in sync when members are added or removed
		struct SJobPerfStats
		{
			//always leave cacheHits as first component, incremented directly by address in SPUJob.h
			unsigned int cacheHits;							//cache hits
			unsigned int spuJobTime;						//time for the job itself (in ticks)
			unsigned int allocSize;							//allocation amount
			unsigned int totalBucketAllocSize;	//effective bucket allocation amount

			unsigned int cacheMisses;						//cache misses
			unsigned int cacheWritesBackSync;		//number of cache lines transferred back synchronously (became dirty)
			unsigned int cacheFlushsNoWrite;		//number of cache lines NOT transferred back (non dirty)
			unsigned int prefetchHits;					//prefetch hits

			unsigned int prefetchMisses;				//prefetch misses
			unsigned int cacheWritesBackASync;	//asynchronous write backs
			unsigned int memTransToLS;					//amount of cache memory transferred to LS
			unsigned int memTransFromLS;				//amount of cache memory transferred back from LS

			unsigned short spuSetupTime;				//job time without job function (in ticks)
			unsigned short allocsBucket;				//allocations in job handled by bucket system
			unsigned short lostLineEvents;			//number of lost Line events for Cache write backs
			unsigned short allocsNoBucket;			//allocations in job not handled by bucket system
			unsigned short driverSize;					//size of driver in 16 byte chunks
			unsigned short jobSize;							//size of job in 16 byte chunks
			unsigned short freeCount;						//number of memory releases into bucket allocator
			unsigned short freeCountHistory;		//number of memory releases matching history table

			unsigned int spuCacheMissTime;			//time for the job spent in the cache miss handler
			unsigned short freeCountNoBucket;		//number of memory releases outside bucket allocator
			unsigned short spuFetchTime;				//time between job fetch and call to ProcessJob
			unsigned short atomicOps;						//number of atomic operations issued
			unsigned short firstBubbleSize;			//size of first bubble in 16 bytes
			unsigned short pad;									//unused filler, completes this 16 byte group
			unsigned char  stackSize;						//size of stack for job (in KB)
			unsigned char  cacheSize;						//cache setup for job (in KB)

			unsigned int bubMemTransferred;			//bubble memory transferred in bytes
			unsigned short returnMissHandlerCalls;//number of calls into return miss handler
			unsigned short callMissHandlerCalls;//number of calls into call miss handler
			unsigned short bubbleMisses;				//non present bubbles in SetActiveBubbles
			unsigned short syncsInCallMissHandler;//number of bubble transfer syncs in call miss handler
			unsigned short bubbleMissesRetMissHandler;//non present return bubbles in return miss handler
			unsigned short bubblesTransferred;	//number of bubbles transferred in total to LS	

			//zeroes all counters by writing six 16 byte vectors (96 bytes) over the object
			//relies on the object being 16 byte aligned (see _ALIGN(16) below) and being exactly 96 bytes in size
			void Reset()
			{
				((vec_uint4*)this)[0] = (vec_uint4)0;
				((vec_uint4*)this)[1] = (vec_uint4)0;
				((vec_uint4*)this)[2] = (vec_uint4)0;
				((vec_uint4*)this)[3] = (vec_uint4)0;
				((vec_uint4*)this)[4] = (vec_uint4)0;
				((vec_uint4*)this)[5] = (vec_uint4)0;
			}
		} _ALIGN(16);
	}
}

//#if defined(JOB_LIB_COMP) || defined(__SPU__)
namespace NSPU
{
	namespace NDriver
	{
		//info block transferred first for each job, its size is passed to CJobManSPU::EnqueueSPUJob
		//we have to transfer the info block, parameter block, input memory needed for execution, data and text of job
		//the first 16 bytes get overwritten with the job state from the SPU, do not store post job persistent data there
		//parameter data, packet chain ptr and dma list data are transferred at once since they got allocated consecutively
		//layout: 32 bytes of header (assuming 4 byte ints) followed by scAvailParamSize bytes of parameter data
		struct SInfoBlock
		{
			SJobState jobState;								//job state variable, set via DMA from SPU, 4 bytes (16 byte aligned)
			unsigned char minStackSize;				//minimal stack size in KB
			unsigned char frameProfIndex;			//index of SFrameProfileData*
			unsigned char pad;								//available
			unsigned char flags2;							//extended flags, currently not set into spu packets
			unsigned int eaExtJobStateAddress;//external job state address /shared with address of prod-cons queue
			unsigned int eaJobPerfAddress;		//source address of profiling stats for this job

			unsigned int eaDMAJobAddress;							//source address of job program data
			//do not reorder easily, copied by unsigned int casts from here on
			unsigned short jobId;											//corresponding job ID, needs to track jobs
			unsigned short jobSize;										//size of job to transfer (contains .data and .text section) (multiple of 4 bytes)
			//lowest bit is 0 if it is a dependent job, 1 to indicate another packet
			unsigned short firstBubbleIndex;					//index of first bubble job branches into, added to fetch bubble fast
			unsigned char flags;											//combination of the scFetchable/scHasQueue/scDebugEnabled/scDebugEnabledDriver bits
			unsigned char depJobIndex;								//index of next dependent job if available, scNoIndex otherwise
			unsigned char opMode;											//bubble mode and cache mode
			unsigned char callbackIndex;							//callback index (index into static job queue structure)
			unsigned char paramSize;									//size in total of parameter block in 16 byte units
			unsigned char spuPacketSyncIndex;					//index into spuPacketSync, 0xFF if it is not used

			static const unsigned int scAvailParamSize = 128 - 32;//size to keep it 128 byte aligned
			static const unsigned int scNoPacketVal		 = 0xFCFCFCFC;//value set if no packet follows

			//parameter data are enclosed within to save a cache miss
			unsigned char paramData[scAvailParamSize];//is 32 byte aligned, make sure it is kept aligned

			static const unsigned int scNoIndex	= 0xFF;//indicates no set callback

			//bits used for flags
			static const unsigned int scFetchable							= 0x1;
			static const unsigned int scHasQueue							= 0x4;
			static const unsigned int scDebugEnabled					= 0x8;
			static const unsigned int scDebugEnabledDriver		= 0x2;

			//bits used for flags2
			static const unsigned int scTransferProfDataBack	= 0x1;

#if defined(__SPU__)
			//force empty ctor to save instruction space
			__attribute__((always_inline))
			inline SInfoBlock(){}
#endif

			//returns the index of the first bubble the job branches into
			__attribute__((always_inline))
			inline const unsigned short GetFirstBubbleIndex() const
			{
				return firstBubbleIndex;
			}

			//replaces the bubble mode bits of opMode, all other opMode bits are kept
			__attribute__((always_inline))
			inline void SetBubbleMode(const NPPU::EBubbleMode cBubbleMode)
			{
				opMode = (opMode & ~BUBBLE_MODE_MASK) | (unsigned char)cBubbleMode;
			}

			//returns the bubble mode bits of opMode
			__attribute__((always_inline))
			inline const NPPU::EBubbleMode GetBubbleMode() const
			{
				return (NPPU::EBubbleMode)(opMode & BUBBLE_MODE_MASK);
			}

			//replaces the cache mode bits of opMode, all other opMode bits are kept
			__attribute__((always_inline))
			inline void SetCacheMode(const NPPU::ECacheMode cCacheMode)
			{
				opMode = (opMode & ~CACHE_MODE_MASK) | (unsigned char)cCacheMode;
			}

			//overwrites opMode as a whole (bubble mode and cache mode at once)
			//NOTE(review): the assert rejects 255 although it would still fit into opMode - confirm this is intended
			__attribute__((always_inline))
			inline void SetOpMode(const unsigned int cMode)
			{
				assert(cMode < 255);
				opMode = (unsigned char)cMode;
			}

			//returns the cache size encoded in the cache mode bits, multiplied by 1024
			//(presumably the encoded value is in KB and the result in bytes - confirm against the mode definitions)
			__attribute__((always_inline))
			inline const unsigned int GetMaxCacheSize() const
			{
				return ((unsigned int)(opMode & CACHE_MODE_MASK) >> CACHE_MODE_SIZE_SHIFT) << 10;//in KB
			}

			//true if scHasQueue is set, eaExtJobStateAddress then holds the queue address
			__attribute__((always_inline))
			inline const bool HasQueue() const
			{
				return (flags & (unsigned char)scHasQueue) != 0;
			}

			//sets or clears the scDebugEnabledDriver flag
			__attribute__((always_inline))
			inline void EnableDriverDebug(const bool cEnable)
			{
				if(cEnable)
					flags |= scDebugEnabledDriver;
				else
					flags &= ~scDebugEnabledDriver;
			}

			//true if the scDebugEnabledDriver flag is set
			__attribute__((always_inline))
			inline const bool IsDriverDebugEnabled() const
			{
				return (flags & (unsigned char)scDebugEnabledDriver) != 0;
			}

			//sets the scDebugEnabled flag (there is no call to clear it again except Reset)
			__attribute__((always_inline))
			inline void EnableDebug()
			{
				flags |= scDebugEnabled;
			}

			//returns 1 if the scDebugEnabled flag is set, 0 otherwise
			__attribute__((always_inline))
			inline const int IsDebugEnabled() const
			{
				return (int)((flags & (unsigned char)scDebugEnabled) != 0);
			}

			//sets or clears the scFetchable flag
			__attribute__((always_inline))
			inline void SetFetchable(const bool cFetchable)
			{
				if(cFetchable)
					flags |= scFetchable;
				else
					flags &= ~scFetchable;
			}

			//true if the scFetchable flag is set
			__attribute__((always_inline))
			inline const bool IsFetchable() const
			{
				return (flags & (unsigned char)scFetchable) != 0;
			}

			//sets or clears the scTransferProfDataBack bit in flags2
			__attribute__((always_inline))
			inline void SetTransferProfDataBack(const bool cTransfer)
			{
				if(cTransfer)
					flags2 |= scTransferProfDataBack;
				else
					flags2 &= ~scTransferProfDataBack;
			}

			//true if the scTransferProfDataBack bit is set in flags2
			__attribute__((always_inline))
			inline const bool TransferProfDataBack() const
			{
				return (flags2 & (unsigned char)scTransferProfDataBack) != 0;
			}

			//prepares the block for a new job
			//	cpQueue:		address of the queue the job belongs to, 0 if there is none (stored in eaExtJobStateAddress)
			//	cFetchable:	initial state of the scFetchable flag
			//	cStackSize:	minimal stack size in KB
			//flags is rebuilt from scratch (only scHasQueue and scFetchable can be set afterwards), flags2 is cleared
			//the job must not be running; opMode is not touched
			__attribute__((always_inline))
			inline void Reset(const void* const cpQueue, const bool cFetchable, const unsigned char cStackSize)
			{
				//reset only required fields which are possibly not set up
				//queue is passed to avoid LHS when setting it
				depJobIndex							= scNoIndex;
				spuPacketSyncIndex			= scNoIndex;
				assert(!jobState.IsRunning());
//				opMode									= BUBBLE_MODE_DEFAULT | CACHE_MODE_DEFAULT;
				callbackIndex						= scNoIndex;
				minStackSize						= cStackSize;

				//branch free "is non null" mask: all bits set for a non zero address, 0 otherwise
				//(x | -x) has its top bit set for every non zero x, whereas -x alone fails for
				//addresses which already have the top bit set; computed unsigned to avoid signed overflow
				const unsigned int cQueueAddr = (unsigned int)cpQueue;
				const unsigned int cQueueMask = (unsigned int)(((int)(cQueueAddr | (0u - cQueueAddr))) >> 31);
				unsigned int flagSet		= ((unsigned int)scHasQueue & cQueueMask);

				//all bits set if cFetchable is true, 0 otherwise
				const unsigned int cFetchMask = (unsigned int)(((int)(-(int)cFetchable)) >> 31);
				flagSet									|= ((unsigned int)scFetchable & cFetchMask);

				eaExtJobStateAddress		= cQueueAddr;
				flags										= (unsigned char)flagSet;
				flags2									= 0;
			}

			//returns the minimal stack size in bytes (minStackSize is kept in KB)
			__attribute__((always_inline))
			inline const unsigned int GetMinStackSize() const
			{
				return (unsigned int)minStackSize << 10;//in KB
			}

			//sets the index of the next dependent job, must be below 0xFF since that value is scNoIndex
			__attribute__((always_inline))
			inline void SetDependentJobIndex(const unsigned int cJobSlot)
			{
				assert(cJobSlot < 0xFF);
				depJobIndex = (unsigned char)cJobSlot;
			}

			//stores the external job state address, only valid if the block has no queue
			//(eaExtJobStateAddress is shared between both uses)
			__attribute__((always_inline))
			inline void SetExtJobStateAddress(const unsigned int cAddr)
			{
				assert(!HasQueue());
				eaExtJobStateAddress = cAddr;
			}

			//returns the external job state address, only valid if the block has no queue
			__attribute__((always_inline))
			inline unsigned int GetExtJobStateAddress() const
			{
				assert(!HasQueue());
				return eaExtJobStateAddress;
			}

			//returns the queue address, only valid if the block has a queue
			__attribute__((always_inline))
			inline unsigned int GetQueue() const
			{
				assert(HasQueue());
				return eaExtJobStateAddress;
			}

			//returns the start of the embedded parameter data, asserts that the block has no queue
			__attribute__((always_inline))
			inline unsigned char* const __restrict GetParamAddress()
			{
				assert(!HasQueue());
				return &paramData[0];
			}
		} _ALIGN(128);

		//parameter type passed to spu driver
		//plain block of four unsigned ints without any interpretation attached at this level
		struct SSpuParam
		{
			unsigned int data[4];
		};

		//sizes for accessing the SQueueNodeSPU array
		//both constants describe the same value (1 << scSizeOfSJobQueueEntryShift == scSizeOfSJobQueueEntry)
		//and are hard coded, so they have to be adjusted by hand if the size of SInfoBlock changes
		static const unsigned int scSizeOfSJobQueueEntry			= 128;	//sizeof SInfoBlock (extra coded because shift)
		static const unsigned int scSizeOfSJobQueueEntryShift = 7;		//SInfoBlock in shifts

		//state information for the fill and empty pointers of the job queue
		//this is usable as shared memory with concurrent accesses from several SPUs at once
		//four unsigned ints, aligned at 16 bytes
		struct SJobQueuePos
		{
			unsigned int					lockObtained;	//keeps track if the consumer state is locked (and by which SPUid, only pull)
			unsigned int					baseAddr;			//base of job queue
			unsigned int					topAddr;			//top of job queue
			volatile unsigned int	curAddr;			//current position in job queue
		} _ALIGN(16);
	}//NDriver
}//NSPU

namespace NPPU
{
	//spu queue node where jobs are pushed into and pulled from
	//push and pull are individually aligned because MFC_LLAR atomics are used for mutual exclusive access (they operate on a 128 byte address base)
	//do not waste memory by putting the job queue between the 2 aligned buffers
	//the address differences between the members are exported as constants (see scJobInfoPushPullAddressDiff and its siblings) -> keep them in sync with SQueueNodeSPU
	struct SQueueNodeSPU
	{
		//both positions are declared 128 byte aligned, the pull pointer needs it since MFC atomics are used
		NSPU::NDriver::SJobQueuePos push _ALIGN(128);							//position in which jobs are pushed by the PPU
		//fills the remainder of the 128 bytes started by push
		unsigned int pushPadPush[(128 - sizeof(NSPU::NDriver::SJobQueuePos))/sizeof(unsigned int)];
		//do not change both pull/push positions since they are accessed in order as NSPU::NDriver::SJobQueuePos*
		//leave padding since 128 bytes are transferred back for peak performance
		NSPU::NDriver::SJobQueuePos pull _ALIGN(128);							//position from which jobs are pulled by the SPU, dont add anything in between
		//push is kept in another cache line to be able to update the new pull pointer from the SPU unconditionally
		//fills the remainder of the 128 bytes started by pull
		unsigned int pushPadPull[(128 - sizeof(NSPU::NDriver::SJobQueuePos))/sizeof(unsigned int)];

		//do not change location of spuPacketSync relative to pull
		static const unsigned short scSyncCount = 128;										//number of indices before swapping around
		static const unsigned short scSyncMaxIndex = scSyncCount - 1;			//controls swapping of indices
		unsigned char spuPacketSync[scSyncCount] _ALIGN(128);							//sync area for spu packets, first byte is lock value

		NSPU::NDriver::SInfoBlock jobInfoBlocks[scMaxWorkQueueJobs] _ALIGN(128);//job info blocks fetched by SPU
		SJobData jobData[scMaxWorkQueueJobs] _ALIGN(128);					//additional job data (callback etc.), one entry per job info block
#if defined(SUPP_SPU_FRAME_STATS)
		SFrameProfileData* frameProfData[MAX_PROFILE_JOBS];	//frame profiling data
#endif
	};

	//address differences used by SPUs: byte offsets between members of SQueueNodeSPU, computed via offsetof
	//so that they follow any layout change of the struct
	//NOTE(review): scJobInfoBlocksJobDataSyncAddressDiff is measured from push while all other array offsets
	//are measured from pull - confirm against the SPU side that this is intended
	static const unsigned int scJobInfoBlocksPullAddressDiff	= offsetof(SQueueNodeSPU, jobInfoBlocks) - offsetof(SQueueNodeSPU, pull);
	static const unsigned int scJobInfoPushPullAddressDiff		= offsetof(SQueueNodeSPU, pull) - offsetof(SQueueNodeSPU, push);
	static const unsigned int scJobInfoBlocksSPUPacketSyncAddressDiff	= offsetof(SQueueNodeSPU, spuPacketSync) - offsetof(SQueueNodeSPU, pull);
	static const unsigned int scJobInfoBlocksJobDataSyncAddressDiff	= offsetof(SQueueNodeSPU, jobData) - offsetof(SQueueNodeSPU, push);
#if defined(SUPP_SPU_FRAME_STATS)
	static const unsigned int scJobInfoBlocksFrameProfDataSyncAddressDiff	= offsetof(SQueueNodeSPU, frameProfData) - offsetof(SQueueNodeSPU, pull);
#endif
}
//#endif//if defined(JOB_LIB_COMP) || defined(__SPU__)

#endif //PS3
#endif //__SPU_JOBSTRUCTS_H
