/* 
	producer (1 PPU thread) - consumer queue (1 SPU)

	- all implemented ILINE using a template:
		- ring buffer size (num of elements)
		- instance type of job
		- param type of job
	- Factory with macro instantiating the queue (knowing the exact names for a job)
	- queue consists of:
		- ring buffer consists of 1 instance of param type of job
		- for atomic test on finished and push pointer update, the queue is 128 byte aligned and push, pull ptr and 
				DMA job state are lying within that first 128 byte
		- volatile push (only modified by PPU) /pull (only modified by SPU) pointer, point to ring buffer, both equal in the beginning
		- job instance (create with def. ctor)	
		- DMA job state (running, finished)
		- AddPacket - method to add a packet
			- waits on the current push/pull pointers until a slot becomes available
			- needs to atomically test if a SPU is running and update the push pointer; if the SPU is finished, a new job is started
		- IsEmpty method, returns push==pull
		- WaitFinished method, polls the DMA job state with nops in between

	PPU job manager side:
		- provide RegisterProdConsumerQueue - method, set flags accordingly
	SPU side: 
		- check if it has  a prod/consumer queue
		- if it is part of queue, it must obtain DMA address for job state and of push/pull (static offset to pointer)
		- a flag tells if job state is job state or queue (eaExtJobStateAddress becomes the queue address)
		- lock is obtained when current push/pull ptr is obtained, snooping therefore enabled
		- before FlushCacheComplete, get next parameter packet if multiple packets needs to be processed
		- if all packets were processed, try to write updated pull pointer and set finished state
				if it fails, push pointer was updated during processing time, in that case get lock again and with it the new push pointer
		- loop until the lock and finished state were updated successfully
		- no HandleCallback method, only 1 SPU is permitted to run with queue
		- no write back to external job state, always just one inner packet loop

*/

#ifndef __PROD_CONS_QUEUE_H
#define __PROD_CONS_QUEUE_H
#pragma once

#if defined(PS3)

#if !defined(__SPU__)
	#include <stdio.h>
	#include <stdlib.h>
	#if defined(SUPP_SN)
		#include <lib/libsn.h>
	#endif
	#include <ppu_intrinsics.h>
#endif

#if !defined(JOB_LIB_COMP)
	#define DEVIRTUALIZE_IJOBMAN_PRODQUEUE
#endif
#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
	#include "JobManSPU.h"
#else
	#include <IJobManSPU.h>
#endif

#if !defined(ILINE)
	#define ILINE __attribute__((always_inline)) inline
#endif

//enable to perform dead lock checks or SPU job hang ups
//	when enabled, the wait loops of the queue count their iterations and report a deadlock once the counter exceeds MAX_IT
//#define CHECK_DEADLOCK
#if defined(CHECK_DEADLOCK)
	#if defined(SUPP_SN)
		//defined elsewhere, a non zero value makes the wait loops enter the deadlock handling without waiting for MAX_IT
		extern uint32 g_ForceStopSPUs;
	#endif
	//iteration limit of the wait loops before a deadlock is reported (smaller in debug builds)
	#if defined(_DEBUG)
		#define MAX_IT 1000000
	#else
		#define MAX_IT 10000000
	#endif
#endif

//inlining policy used for WaitFinished: plain inline if the deadlock checks are compiled in, forced inline (ILINE) otherwise
#if defined(CHECK_DEADLOCK)
	#define INLINE_POL inline
#else
	#define INLINE_POL ILINE
#endif

namespace NPPU
{
#if !defined(__SPU__)

	//Atomically compares and updates the 8 bytes starting at pDst with a load-reserve / store-conditional pair.
	//	the 8 bytes are treated as two consecutive 32 bit words {word0, word1}, implemented is:
	//		if(word0 == cCompVal && word1 == cSrcCompVal){ word0 = cCompVal; word1 = cVal; return 1;} else return 0;
	//	as it can only reserve a cache line on an 8 byte base, we have to use a union to pack both words
	//	pDst:					address of word0, it is accessed as uint64_t (presumably has to be 8 byte aligned - TODO confirm)
	//	cVal:					new value for word1
	//	pCompDst:			not referenced by the active implementation (only by the disabled asm version below)
	//	cCompVal:			expected value of word0, it is written back unchanged
	//	cSrcCompVal:	expected value of word1
	//	returns 1 if the new value has been stored, 0 if the current 8 byte value did not match the expected one
	ILINE int InterlockedCompareExchangeEx
	(
		volatile uint32_t * const pDst,
		const uint32_t cVal,
		volatile uint32_t *const pCompDst,
		const uint32_t cCompVal,
		const uint32_t cSrcCompVal
	)
	{
		//val0 overlays the first 4 bytes of ldVal, val1 the following 4 bytes
		union 
		{
			uint64_t ldVal;
			struct  
			{
				uint32_t val0;
				uint32_t val1;
			};
		} queueValDst, queueValSrc;
		//value to store: word0 stays cCompVal, word1 becomes cVal
		queueValDst.val0 = cCompVal;
		queueValDst.val1 = cVal;
		//value expected in memory: {cCompVal, cSrcCompVal}
		queueValSrc.val0 = cCompVal;
		queueValSrc.val1 = cSrcCompVal;
		//disabled inline asm version, kept for reference
/*		
		uint32_t ret = 0;//return value
		uint32_t old;//destination for current value
		__asm__ volatile
		(
			".loop%=:															# loop start\n"
			"	ldarx   %[old], 0, %[compdst]				# load and reserve\n"
			"	cmpd    %[old], %[comperand]				# (*pCompDst == cCompVal)?\n"			
			"	bne-    .Ldone%=										# break since (*pCompDst != cCompVal)\n"									
			"	stdcx.  %[exchange], 0, %[dst]			# *pDst = cVal\n"
			"	beq-    .exsuc%=										# check if write successful\n"	
			" db16cyc											        # give other hardware thread chance to run\n"
			" db16cyc											        # give other hardware thread chance to run\n"
			" db16cyc											        # give other hardware thread chance to run\n"
			" db16cyc											        # give other hardware thread chance to run\n"
			"	b    .loop%=											  # loop if lost reservation\n"	
			".exsuc%=:														# exchange written\n"
			"	addi		%[ret], %[ret], 1						# ret = 1\n"
			".Ldone%=:														# loop end\n"							
			: [old]"=&r"(old), [ret]"=&r"(ret)
			: [dst]"b"(pDst), [compdst]"b"(pCompDst), [comperand]"r"(queueValSrc), [exchange]"r"(queueValDst.ldVal)
			: "cc", "memory"
		);
*/
		//load with reservation and compare; the store is retried as long as __stdcx returns 0
		//	(which this loop treats as a lost reservation)
		do 
		{
			if (__ldarx((uint64_t*__restrict)pDst) != queueValSrc.ldVal)
				return 0;//current value differs from the expected one, nothing is written
		} while (0 == __stdcx((uint64_t*__restrict)pDst, queueValDst.ldVal));
		return 1;//conditional store went through
	}

	//tuning constant for the busy wait loops: per poll of the queue state WaitFinished runs IDLE_NOPS/16 and
	//	AddPacket IDLE_NOPS/8 inner iterations, each of them issuing 4 __db16cyc() delays
	#define IDLE_NOPS 32

	//Single producer (PPU thread) / single consumer (SPU job) queue built on a ring buffer of parameter packets.
	//	TJobType:	job class, has to provide the nested type packet and the methods called by this queue
	//						(RegisterQueue, SetJobPerfStats, SetParamDataSize, SetMinStackSizeKB, SetCacheMode, Run and
	//						GetJobName for the deadlock checks)
	//	Size:			number of packet slots of the ring buffer, the constructor asserts Size > 2
	//	the member order of the first cache line must stay in sync with the scProdCons* offset constants
	template <class TJobType, unsigned int Size>
	class CProdConsQueue
	{
	public:
		//registers the queue at the job instance, the ring buffer gets allocated with the first AddPacket call
		//	cKeepCache is forwarded to TJobType::RegisterQueue
		CProdConsQueue(const bool cKeepCache = false);																//default ctor
		//frees the ring buffer if it has been allocated
		~CProdConsQueue();

		//adds a new parameter packet (job invocation)
		//	crPacket:					packet to copy into the ring buffer, its size must be a multiple of 16 bytes
		//	cMinStackSizeKB:	stack size stored alongside the packet and set on the job instance if a new job is started
		//	cCacheMode:				cache mode stored alongside the packet and set on the job instance if a new job is started
		void AddPacket
		(
			const typename TJobType::packet& crPacket, 
			const uint32 cMinStackSizeKB = CACHE_MIN_STACK_SIZE,
			const NPPU::ECacheMode cCacheMode = eCM_64
		);
		void WaitFinished();										//wait til all current jobs have been finished and been processed by a SPU
		bool IsEmpty();													//returns true if push and pull pointer are equal
//#if defined(DO_SPU_PROFILING)
		const volatile NSPU::NDriver::SJobPerfStats& GetPerfStats();	//return current performance stats
//#endif

	private:
		//------------start of synchronized cache line-------------------------------------------
		//	8 words + pad2[24] add up to 128 bytes, assuming 4 byte pointers and 4 byte unsigned int
		volatile unsigned int m_DMAJobState;								//dma job state (scJobFinished or scJobRunning)
		void* m_pPush;																			//push pointer, current ptr to push packets into (written by PPU)
		//pull pointer, current ptr to pull packets from (written by SPU)
		//	the pointer itself has to be volatile (not the pointee): it changes behind the compilers back
		//	and is polled by the PPU in the AddPacket wait loop, so each poll must reload it from memory
		void* volatile m_pPull;
		unsigned int pad;																		//pad to keep 16 byte aligned
		unsigned int m_PullIncrement;												//increment of pull, also accessed by SPU
		unsigned int m_RingBufferStart;											//start of ring buffer (to swap properly by SPU), also accessed by SPU
		unsigned int m_RingBufferEnd;												//end of ring buffer (to swap properly by SPU), also accessed by SPU
		unsigned int m_AddPacketDataOffset;									//offset of additional data relative to push ptr
		unsigned int pad2[24];															//padding for the first full cache line
		//------------end of synchronized cache line-------------------------------------------
		void* m_pRingBuffer;																//the ring buffer
//#if defined(DO_SPU_PROFILING)
		volatile NSPU::NDriver::SJobPerfStats g_PerfStatsJob;//job specific performance stats location
//#endif
		TJobType m_JobInstance;							//job instance
		int	m_Initialized;									//0 until Init has allocated the ring buffer, 1 afterwards
#if defined(SUPP_SN)
		int m_EncounteredDebug;							//set to 1 by the wait loops once debugging was detected, reset by WaitFinished
#endif
		void Init(const unsigned int cPacketSize);		//initializes queue

		void* const GetIncrementedPointer() ;//get incremented ptr, takes care of wrapping

	} _ALIGN(128);//align for DMA speed and cache line reservation

#endif //__SPU__

	//additional per packet data, AddPacket stores it in the ring buffer slot directly behind the packet
	//	(at m_AddPacketDataOffset relative to the slot start)
	struct SAddPacketData
	{
		unsigned int stackSizeKB;		//value of cMinStackSizeKB passed to AddPacket
		unsigned int cacheMode;			//value of cCacheMode passed to AddPacket, stored as unsigned int
		unsigned int pad[2];				//fills the struct up to 4 words
	} _ALIGN(16);

	//keep offsets in sync, do not use offsetof since this would make the struct code fully known to the SPU
	//	the offsets are byte offsets into CProdConsQueue and match its member order for 4 byte wide members
	//	(m_DMAJobState at 0, m_pPush at 4, m_pPull at 8, pad at 12, m_PullIncrement at 16, ...)
	static const unsigned int scJobFinished				= 0x0;											//constant for SPU job is finished
	static const unsigned int scJobRunning				= 0x1;											//constant for SPU job running
	static const unsigned int scProdConsPushOff		= 4;												//offset of m_pPush
	static const unsigned int scProdConsPullOff		= 8;												//offset of m_pPull
	static const unsigned int scProdConsPullIncr	= 16;												//offset of m_PullIncrement
	static const unsigned int scProdConsPullStart	= scProdConsPullIncr + 4;		//offset of m_RingBufferStart
	static const unsigned int scProdConsPullEnd		= scProdConsPullStart + 4;	//offset of m_RingBufferEnd
}//NPPU


#if !defined(__SPU__)

//releases the ring buffer through the job manager, does nothing if Init never ran or no buffer is present
template <class TJobType, unsigned int Size>
ILINE NPPU::CProdConsQueue<TJobType, Size>::~CProdConsQueue()
{
	if(!m_Initialized)
		return;//Init was never executed, m_pRingBuffer has not been set by it
	if(!m_pRingBuffer)
		return;//nothing to free
	GetIJobManSPU()->Free(m_pRingBuffer);
}

//constructs an idle queue and registers it at its job instance
//	cKeepCache: forwarded to TJobType::RegisterQueue
//	the ring buffer itself is allocated lazily by Init on the first AddPacket call
template <class TJobType, unsigned int Size>
ILINE NPPU::CProdConsQueue<TJobType, Size>::CProdConsQueue(const bool cKeepCache) : m_Initialized(0)
{
	assert(Size > 2);
	m_DMAJobState = scJobFinished;
	//give all queue pointers a defined state: IsEmpty and the final assert in WaitFinished compare
	//	push and pull and may be called before Init has run, push == pull == 0 reads as empty
	m_pPush								= 0;
	m_pPull								= 0;
	m_pRingBuffer					= 0;
	m_PullIncrement				= 0;
	m_RingBufferStart			= 0;
	m_RingBufferEnd				= 0;
	m_AddPacketDataOffset	= 0;
#if defined(SUPP_SN)
	m_EncounteredDebug		= 0;
#endif
	m_JobInstance.RegisterQueue((void*)this, cKeepCache);
//#if defined(DO_SPU_PROFILING)
	m_JobInstance.SetJobPerfStats(&g_PerfStatsJob);
//#endif
}

//allocates the ring buffer and sets up all pointers and offsets of the queue
//	cPacketSize: size of one parameter packet in bytes, has to be a multiple of 16
//	each of the Size slots holds the packet followed by one SAddPacketData
template <class TJobType, unsigned int Size>
ILINE void NPPU::CProdConsQueue<TJobType, Size>::Init(const unsigned int cPacketSize)
{
	assert((cPacketSize & 15) == 0);

	//slot layout: [packet][SAddPacketData]
	const unsigned int cSlotSize	= cPacketSize + sizeof(SAddPacketData);
	const unsigned int cRingBytes	= Size * cSlotSize;
	m_AddPacketDataOffset = cPacketSize;
	m_PullIncrement				= cSlotSize;

	//ring buffer is requested with an alignment argument of 128
	m_pRingBuffer = GetIJobManSPU()->Allocate(cRingBytes, 128);
	assert(m_pRingBuffer);

	//push and pull both start at the first slot
	m_pPull = m_pRingBuffer;
	m_pPush = m_pRingBuffer;
	const unsigned int cStartAddr = (unsigned int)m_pRingBuffer;
	m_RingBufferStart	= cStartAddr;
	m_RingBufferEnd		= cStartAddr + cRingBytes;
	m_Initialized			= 1;
	m_JobInstance.SetParamDataSize(cPacketSize);
#if defined(SUPP_SN)
	m_EncounteredDebug = 0;
#endif
}

//busy waits until the DMA job state of the queue reads scJobFinished
//	- between two polls of the state a few __db16cyc() delays are issued
//	- with SUPP_SN: once debugging has been detected through the job manager, the deadlock checks are skipped
//		(unless CHECK_DEADLOCK is active and g_ForceStopSPUs is set)
//	- with CHECK_DEADLOCK: after more than MAX_IT polls (or if g_ForceStopSPUs is set) a deadlock message is printed
//		and the wait is either aborted or, with SUPP_SN, handled as described inside the loop
//	asserts push == pull on exit
template <class TJobType, unsigned int Size>
INLINE_POL void NPPU::CProdConsQueue<TJobType, Size>::WaitFinished() 
{
#if defined(CHECK_DEADLOCK)
	volatile int counter = 0;//number of polls performed so far
#endif
	while(m_DMAJobState != scJobFinished)
	{
		//idle a little before the state is polled again
		for(int i=0; i<IDLE_NOPS/16; ++i)
		{
			__db16cyc();
			__db16cyc();
			__db16cyc();
			__db16cyc();
		}
#if defined(SUPP_SN)
		//latch the debugging state, it stays set til the end of this call
	#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
		CJobManSPU *const __restrict pIJobMan = CJobManSPU::Instance();
		if(!m_EncounteredDebug && pIJobMan->CJobManSPU::IsDebuggingActive())
			m_EncounteredDebug = 1;
	#else
		NPPU::IJobManSPU* const pIJobMan = GetIJobManSPU();
		if(!m_EncounteredDebug && pIJobMan->IsDebuggingActive())
			m_EncounteredDebug = 1;
	#endif
		//while debugging just keep polling and skip the deadlock detection below
	#if defined(CHECK_DEADLOCK)
		if(m_EncounteredDebug && !g_ForceStopSPUs)
	#else
		if(m_EncounteredDebug)
	#endif
			continue;
#endif
#if defined(CHECK_DEADLOCK)
	#if defined(SUPP_SN)
		if(g_ForceStopSPUs || counter++ > MAX_IT)
	#else
		if(counter++ > MAX_IT)
	#endif
		{
			//report the queue state at the time the deadlock was detected
			const unsigned int cPull	= (unsigned int)m_pPull;
			const unsigned int cPush	= (unsigned int)m_pPush;
			const unsigned int cState = m_DMAJobState;
			printf("Deadlock in CProdConsQueue::WaitFinished(job: \"%s\"), pull=0x%08x   push=0x%08x  state=%s\n",m_JobInstance.GetJobName(),cPull, cPush, (cState == scJobRunning)?"Running":"Finished");
#if defined(SUPP_SN)
			//if VerifySPUs returns 0: write 0 to the run control register of every SPU reported as processing,
			//	notify the debugger and spin forever; otherwise treat it like an active debug session and keep waiting
	#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
			if(0 == pIJobMan->CJobManSPU::VerifySPUs())
	#else
			if(0 == pIJobMan->VerifySPUs())
	#endif
			{
				printf("   Stopped all running SPUs(index):");
				const uint32 cNumSPUAllowed = GetIJobManSPU()->GetSPUsAllowed();
				for(uint32 i=0; i<cNumSPUAllowed; ++i)
				{
#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
					if(!IsSPUNonThreadedProcessing(i))
#else
					if(!pIJobMan->IsSPUProcessing(i))
#endif
						continue;//SPU is not reported as processing, leave it alone
					printf("  %d",i);
					NPPU::WriteSPUProbReg(i, NPPU::scPCSPURunCntl, 0);
					snRawSPUNotifySPUStopped(i);
				}
				printf("\n");
				while(1){}//intentionally never returns
			}
			else
			{
				m_EncounteredDebug = 1;
				continue;
			}
#endif
			break;//without SUPP_SN: give up waiting after the deadlock report
		}
#endif
	}
	assert((uint32)m_pPush == (uint32)m_pPull);
#if defined(SUPP_SN)
	m_EncounteredDebug = 0;//reset
#endif
}

//returns true if the push and the pull pointer hold the same address
//	NOTE(review): AddPacket treats push == pull together with a running job as "no free slot", so pointer
//	equality alone presumably does not tell an empty from a completely filled ring - confirm against the SPU side
template <class TJobType, unsigned int Size>
ILINE bool NPPU::CProdConsQueue<TJobType, Size>::IsEmpty() 
{
	const unsigned int cPushAddr = (unsigned int)m_pPush;
	const unsigned int cPullAddr = (unsigned int)m_pPull;
	return cPushAddr == cPullAddr;
}

//#if defined(DO_SPU_PROFILING)
//gives read only access to the performance stats member that was handed to the job instance in the constructor
template <class TJobType, unsigned int Size>
ILINE const volatile NSPU::NDriver::SJobPerfStats& NPPU::CProdConsQueue<TJobType, Size>::GetPerfStats()
{
	return this->g_PerfStatsJob;
}
//#endif //DO_SPU_PROFILING

//returns the push pointer advanced by one slot (m_PullIncrement), wrapped to the ring buffer start
//	once it is no longer below the ring buffer end; implemented without a branch, m_pPush is not modified
template <class TJobType, unsigned int Size>
ILINE void* const NPPU::CProdConsQueue<TJobType, Size>::GetIncrementedPointer() 
{
	const unsigned int cAdvanced = (unsigned int)m_pPush + m_PullIncrement;
	//negative as long as the advanced pointer is still inside the ring buffer
	const int cDistToEnd = (int)(cAdvanced - m_RingBufferEnd);
	//sign bit smeared over all bits: all ones -> keep cAdvanced, zero -> wrap to start
	const unsigned int cKeepMask = (unsigned int)(cDistToEnd >> 31);
	const unsigned int cKept		= cAdvanced & cKeepMask;
	const unsigned int cWrapped	= m_RingBufferStart & ~cKeepMask;
	return (void*)(cKept | cWrapped);
}

//adds a parameter packet to the queue and makes sure a SPU job is processing it
//	crPacket:					packet to copy into the current push slot, copied in 16 byte units
//	cMinStackSizeKB:	stored behind the packet, also set on the job instance if a new job gets started
//	cCacheMode:				stored behind the packet, also set on the job instance if a new job gets started
//	steps:
//		- the first call allocates the ring buffer through Init
//		- busy wait as long as push == pull while the job state is not finished (no free slot)
//		- copy the packet and its SAddPacketData into the push slot
//		- publish the incremented push pointer: if the job is finished a new job is started, otherwise the push
//			pointer is exchanged atomically together with a check that the job state is still running
//	NOTE(review): no explicit memory barrier is issued between the packet copy and the publication of the push
//		pointer, presumably ordering is provided elsewhere - confirm
template <class TJobType, unsigned int Size>
inline void NPPU::CProdConsQueue<TJobType, Size>::AddPacket
(
	const typename TJobType::packet& crPacket, 
	const uint32 cMinStackSizeKB,
	const NPPU::ECacheMode cCacheMode
)
{
	const uint32 cPacketSize = crPacket.GetPacketSize();
	if(__builtin_expect(m_Initialized == 0, 0))
		Init(cPacketSize);//lazy initialization with the size of the first packet

	//all packets must have the size the ring buffer has been set up with
	assert(m_RingBufferEnd == m_RingBufferStart + Size * (cPacketSize + sizeof(SAddPacketData)));

#if defined(CHECK_DEADLOCK)
	volatile int counter = 0;		//number of polls performed so far
#endif
	const void* const cpCurPush = m_pPush;
	while(__builtin_expect((cpCurPush == m_pPull) && (m_DMAJobState != scJobFinished), 0))//wait til a slot becomes available
	{
		//idle a little before pull pointer and state are polled again
		for(int i=0; i<IDLE_NOPS / 8; ++i)
		{
			__db16cyc();
			__db16cyc();
			__db16cyc();
			__db16cyc();
		}
#if defined(SUPP_SN)
		//latch the debugging state, it is reset by WaitFinished
	#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
		CJobManSPU* const pIJobMan = (CJobManSPU*)GetIJobManSPU();
		if(!m_EncounteredDebug && pIJobMan->CJobManSPU::IsDebuggingActive())
			m_EncounteredDebug = 1;
	#else
		NPPU::IJobManSPU* const pIJobMan = GetIJobManSPU();
		if(!m_EncounteredDebug && pIJobMan->IsDebuggingActive())
			m_EncounteredDebug = 1;
	#endif
		//while debugging just keep polling and skip the deadlock detection below
	#if defined(CHECK_DEADLOCK)
		if(m_EncounteredDebug && !g_ForceStopSPUs)
	#else
		if(m_EncounteredDebug)
	#endif
			continue;
#endif
#if defined(CHECK_DEADLOCK)
	#if defined(SUPP_SN)
		if(g_ForceStopSPUs || counter++ > MAX_IT)
	#else
		if(counter++ > MAX_IT)
	#endif
		{
			printf("Deadlock in CProdConsQueue::AddPacket(job: \"%s\")\n",m_JobInstance.GetJobName());
	#if defined(SUPP_SN)
			//if VerifySPUs returns 0: write 0 to the run control register of every SPU reported as processing,
			//	notify the debugger and spin forever; otherwise treat it like an active debug session and keep waiting
		#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
			if(0 == pIJobMan->CJobManSPU::VerifySPUs())
		#else
			if(0 == pIJobMan->VerifySPUs())
		#endif
			{
				printf("   Stopped all running SPUs(index):");
				const uint32 cNumSPUAllowed = GetIJobManSPU()->GetSPUsAllowed();
				for(uint32 i=0; i<cNumSPUAllowed; ++i)
				{
#if defined(DEVIRTUALIZE_IJOBMAN_PRODQUEUE)
					if(!IsSPUNonThreadedProcessing(i))
#else
					if(!pIJobMan->IsSPUProcessing(i))
#endif
						continue;//SPU is not reported as processing, leave it alone
					printf("  %d",i);
					NPPU::WriteSPUProbReg(i, NPPU::scPCSPURunCntl, 0);
					snRawSPUNotifySPUStopped(i);
				}
				printf("\n");
				while(1){}//intentionally never returns
			}
			else
			{
				m_EncounteredDebug = 1;
				continue;
			}
	#endif
			break;//without SUPP_SN: stop waiting after the deadlock report and go on with the push
		}
#endif
	};
	//get the incremented (wrapped) push pointer, it gets published after the packet has been written
	void* const cpNextPushPtr = GetIncrementedPointer();

	const vec_uint4 * __restrict pPacketCont = crPacket.GetPacketCont();
	vec_uint4 * __restrict pPushCont				 = (vec_uint4*)cpCurPush;
	const uint32 cIters = cPacketSize >> 4;//number of 16 byte units
	for(uint32 i=0;i<cIters; ++i)//copy packet data
		pPushCont[i] = pPacketCont[i];

	//encode stack size, the SPU must get the updated size even in case no new job is spawned
	SAddPacketData* const __restrict pAddPacketData = (SAddPacketData*)((unsigned char*)cpCurPush + m_AddPacketDataOffset);
	pAddPacketData->stackSizeKB = cMinStackSizeKB;
	pAddPacketData->cacheMode		= (unsigned int)cCacheMode;

	if(m_DMAJobState == scJobFinished)//job is already finished, since only one SPU is active, no need for synchronizations
	{
		m_pPush = cpNextPushPtr;//make visible to SPU
		m_JobInstance.SetMinStackSizeKB(cMinStackSizeKB);
		m_JobInstance.SetCacheMode(cCacheMode);
		m_DMAJobState = scJobRunning;
		m_JobInstance.Run();
	}
	else
	{
		//push pointer needs to be atomically updated together with the check if a SPU job has been finished
		//this way it is ensured that the push pointer is either known to the SPU (which is snooping on that cache line) 
		//	or a new job is to be started
		//the job state is supposed to be running if we get here
		//we must be sure that push pointer is only updated if the state is not switching to finished (reservations can operate max on 8 byte boundary)
		//the exchange works on the 8 bytes {m_DMAJobState, m_pPush}: expected is {scJobRunning, cpCurPush},
		//	stored is {scJobRunning, cpNextPushPtr}
		if(0 == InterlockedCompareExchangeEx((volatile unsigned int*)&m_DMAJobState, (unsigned int)cpNextPushPtr, (volatile unsigned int*)&m_DMAJobState, scJobRunning, (unsigned int)cpCurPush))
		{
			//job state has switched to finished in the meantime
			assert(m_DMAJobState == scJobFinished);
			m_pPush = cpNextPushPtr;//make visible to SPU
			m_JobInstance.SetMinStackSizeKB(cMinStackSizeKB);
			m_JobInstance.SetCacheMode(cCacheMode);
			m_DMAJobState = scJobRunning;
			m_JobInstance.Run();
		}
	}
}

#undef DEVIRTUALIZE_IJOBMAN_PRODQUEUE

//type macros for SPU jobs
//	expands to the queue type for the job class name with size ring buffer slots
#define PROD_CONS_QUEUE_TYPE(name, size) NPPU::CProdConsQueue<name, (size)>

#endif //__SPU__

#endif //PS3
#endif //__PROD_CONS_QUEUE_H
