/* 
	producer (1 PPU thread) - consumer queue (1 SPU)

	- all implemented ILINE using a template:
		- ring buffer size (num of elements)
		- instance type of job
		- param type of job
	- Factory with macro instantiating the queue (knowing the exact names for a job)
	- queue consists of:
		- ring buffer consists of 1 instance of param type of job
		- for the atomic test on the finished state together with the push pointer update, the queue is 128 byte aligned and the push ptr, pull ptr and
				DMA job state all lie within that first 128 bytes
		- volatile push (only modified by PPU) /pull (only modified by SPU) pointer, point to ring buffer, both equal in the beginning
		- job instance (create with def. ctor)	
		- DMA job state (running, finished)
		- AddPacket - method to add a packet
			- waits on the current push/pull pointers until a slot is available
			- needs to atomically test if an SPU is running and update the push pointer; if the SPU has finished, a new job is started
		- IsEmpty method, returns push==pull
		- WaitFinished method, polls the DMA job state with nops in between

	PPU job manager side:
		- provide RegisterProdConsumerQueue - method, set flags accordingly
	SPU side: 
		- check if it has  a prod/consumer queue
		- if it is part of queue, it must obtain DMA address for job state and of push/pull (static offset to pointer)
		- a flag tells if job state is job state or queue (eaExtJobStateAddress becomes the queue address)
		- lock is obtained when current push/pull ptr is obtained, snooping therefore enabled
		- before FlushCacheComplete, get the next parameter packet if multiple packets need to be processed
		- if all packets were processed, try to write the updated pull pointer and set the finished state
				if that fails, the push pointer was updated during processing time; in that case get the lock again and with it the new push pointer
		- loop until the lock and the finished state were updated successfully
		- no HandleCallback method, only 1 SPU is permitted to run with queue
		- no write back to external job state, always just one inner packet loop

*/

#ifndef __PROD_CONS_QUEUE_H
#define __PROD_CONS_QUEUE_H
#pragma once

#if defined(PS3)

#if !defined(__SPU__)
	#include <stdio.h>
	#include <stdlib.h>
	#if defined(SUPP_SN)
		#include <lib/libsn.h>
	#endif
#endif

#if !defined(ILINE)
	#define ILINE __attribute__((always_inline)) inline
#endif

//enable to perform dead lock checks / detect SPU job hang ups in the wait loops of the queue
//if enabled, MAX_IT is the number of wait loop iterations after which WaitFinished/AddPacket report a deadlock
//	(a smaller limit is used for _DEBUG builds)
#define CHECK_DEADLOCK
#if defined(CHECK_DEADLOCK)
	#if defined(_DEBUG)
		#define MAX_IT 200000
	#else
		#define MAX_IT 500000
	#endif
#endif

#include <IJobManSPU.h>

namespace NPPU
{
#if !defined(__SPU__)

	//implements atomically: if(*pCompDst == cCompVal){ *pDst = cVal; return 1;} else return 0;
	//	pDst			- word receiving cVal if the comparison succeeds
	//	cVal			- value to store into *pDst
	//	pCompDst	- word compared against cCompVal, the reservation is obtained on this address
	//	cCompVal	- value *pCompDst has to hold for the store to be performed
	//returns 1 if the store has been performed, 0 if *pCompDst was different from cCompVal
	//NOTE: the reservation is taken on pCompDst while the conditional store targets pDst, therefore both words are
	//	expected to share one reservation granule (the queue keeps them within its first, 128 byte aligned, 128 bytes)
	ILINE int InterlockedCompareExchangeEx
	(
		volatile unsigned int * const pDst,
		const unsigned int cVal,
		volatile unsigned int *const pCompDst,
		const unsigned int cCompVal
	)
	{
		uint32_t ret = 0;//return value, stays 0 if the comparison fails
		uint32_t old;//destination for current value
		//ret is a read-write operand ("+"): the asm only increments it on success and does not touch it on failure,
		//	so the 0 assigned above must be live on entry (a write-only "=" operand would leave it undefined)
		//volatile keeps the compiler from merging or moving the atomic sequence
		__asm__ __volatile__
		(
			".loop%=:                                 # loop start\n"
			"	lwarx   %[old], 0, %[compdst]         # load and reserve\n"
			"	cmpw    %[old], %[comperand]          # (*pCompDst == cCompVal)?\n"
			"	bne-    .Ldone%=                      # break since (*pCompDst != cCompVal)\n"
			"	stwcx.  %[exchange], 0, %[dst]        # *pDst = cVal\n"
			"	bne-    .loop%=                       # loop if lost reservation\n"
			"	addi    %[ret], %[ret], 1             # ret = 1\n"
			".Ldone%=:                                # loop end\n"
			: [old]"=&r"(old), [ret]"+&r"(ret)
			: [dst]"b"(pDst), [compdst]"b"(pCompDst), [comperand]"r"(cCompVal), [exchange]"r"(cVal)
			: "cc", "memory"
		);
		return ret;//set to 1 in asm if successful
	}

	#define IDLE_NOPS 32				//number of nops performed in each wait loop iteration

	//Single producer (PPU thread) / single consumer (SPU job) queue of job parameter packets.
	//	TJobType	- job type; has to provide the nested type packet and the methods called by this class
	//						(RegisterQueue, SetJobPerfStats, SetParamDataSize, Run)
	//	Size			- number of packets the ring buffer can hold (the ctor asserts Size > 2)
	//The members up to and including pad2 form the block shared with the SPU; their offsets are duplicated by hand
	//	in the scProdCons* constants, so do not reorder or resize them without updating those constants.
	//NOTE(review): the dtor frees m_pRingBuffer and no copy ctor / assignment operator is declared, so copying an
	//	initialized queue would free the buffer twice - presumably instances are never copied, confirm
	template <class TJobType, unsigned int Size>
	class CProdConsQueue
	{
	public:
		CProdConsQueue();																//default ctor, the ring buffer is not allocated before the first AddPacket
		~CProdConsQueue();															//frees the ring buffer if it has been allocated

		void AddPacket(const typename TJobType::packet& crPacket) volatile;	//adds a new parameter packet (job invocation)
		void WaitFinished() volatile;										//polls til the DMA job state has become finished
		bool IsEmpty() volatile;												//returns true if push and pull pointer are equal
//#if defined(DO_SPU_PROFILING)
		const volatile NSPU::NDriver::SJobPerfStats& GetPerfStats() volatile;	//return current performance stats
//#endif
	
	private:
		//------------start of synchronized cache line-------------------------------------------
		volatile unsigned int m_DMAJobState;								//dma job state (scJobFinished or scJobRunning)
		unsigned int pad[3];																//pad to keep 16 byte aligned
		volatile void* m_pPull;															//pull pointer, current ptr to pull packets from (written by SPU)
		unsigned int pad1[3];																//pad to keep 16 byte aligned
		volatile void* m_pPush;															//push pointer, current ptr to push packets into (written by PPU)
		unsigned int m_PullIncrement;												//increment of pull (size of one packet in bytes), also accessed by SPU
		unsigned int m_RingBufferStart;											//start of ring buffer (to swap properly by SPU), also accessed by SPU
		unsigned int m_RingBufferEnd;												//end of ring buffer (to swap properly by SPU), also accessed by SPU
		unsigned int pad2[20];															//padding for the first full cache line
		//------------end of synchronized cache line-------------------------------------------
		void* m_pRingBuffer;																//the ring buffer, allocated by Init
//#if defined(DO_SPU_PROFILING)
		volatile NSPU::NDriver::SJobPerfStats g_PerfStatsJob;//job specific performance stats location
//#endif
		TJobType m_JobInstance;							//job instance
		int	m_Initialized;									//0 til Init has been called, 1 afterwards

		void Init(const unsigned int cPacketSize) volatile;		//initializes queue, allocates the ring buffer for Size packets

		void* const GetIncrementedPointer() volatile ;//get incremented push ptr, takes care of wrapping
	} _ALIGN(128);//align for DMA speed and cache line reservation

#endif //__SPU__

	//job state values and byte offsets of the CProdConsQueue members shared with the SPU
	//keep offsets in sync with the member layout of CProdConsQueue by hand, do not use offsetof since this would make
	//	the struct code fully known to the SPU
	//the +4 steps match members of 4 byte size, the +16 steps the padded 16 byte slots of the class
	static const unsigned int scJobFinished				= 0x0;											//constant for SPU job is finished
	static const unsigned int scJobRunning				= 0x1;											//constant for SPU job running
	static const unsigned int scProdConsDMAOff		= 0;												//offset of m_DMAJobState
	static const unsigned int scProdConsPullOff		= scProdConsDMAOff + 16;		//offset of m_pPull
	static const unsigned int scProdConsPushOff		= scProdConsPullOff + 16;		//offset of m_pPush
	static const unsigned int scProdConsPullIncr	= scProdConsPushOff + 4;		//offset of m_PullIncrement
	static const unsigned int scProdConsPullStart	= scProdConsPullIncr + 4;		//offset of m_RingBufferStart
	static const unsigned int scProdConsPullEnd		= scProdConsPullStart + 4;	//offset of m_RingBufferEnd
}//NPPU


#if !defined(__SPU__)

//dtor, releases the ring buffer; the buffer only exists once Init has run (m_Initialized != 0)
template <class TJobType, unsigned int Size>
ILINE NPPU::CProdConsQueue<TJobType, Size>::~CProdConsQueue()
{
	//never initialized -> nothing has been allocated
	if(!m_Initialized)
		return;
	if(m_pRingBuffer)
		free(m_pRingBuffer);
}

//default ctor: marks the queue as not initialized and idle (job state finished) and registers the queue and its
//	performance stats location at the job instance
//the ring buffer is allocated lazily by Init on the first AddPacket call
template <class TJobType, unsigned int Size>
ILINE NPPU::CProdConsQueue<TJobType, Size>::CProdConsQueue() : m_Initialized(0)
{
	assert(Size > 2);
	m_DMAJobState = scJobFinished;
	//give all ring buffer related members a defined state, Init overwrites them on the first AddPacket
	//without this IsEmpty would compare indeterminate pointers if called before any packet has been added
	m_pPull						= 0;
	m_pPush						= 0;
	m_PullIncrement		= 0;
	m_RingBufferStart	= 0;
	m_RingBufferEnd		= 0;
	m_pRingBuffer			= 0;
	m_JobInstance.RegisterQueue((void*)this);
//#if defined(DO_SPU_PROFILING)
	m_JobInstance.SetJobPerfStats(&g_PerfStatsJob);
//#endif
}

//allocates the ring buffer for Size packets of cPacketSize bytes each and resets push/pull to its start
//	cPacketSize - size of one parameter packet in bytes, must be a multiple of 16
template <class TJobType, unsigned int Size>
ILINE void NPPU::CProdConsQueue<TJobType, Size>::Init(const unsigned int cPacketSize) volatile
{
	assert(0 == (cPacketSize & 15));
	const unsigned int cRingBufferBytes = Size * cPacketSize;
	m_pRingBuffer = memalign(128, cRingBufferBytes);//128 byte aligned storage
	assert(m_pRingBuffer);
	//queue starts empty: push and pull both reference the first slot
	m_pPush = m_pRingBuffer;
	m_pPull = m_pRingBuffer;
	//values the SPU needs to advance and wrap the pull pointer
	m_PullIncrement = cPacketSize;
	m_RingBufferStart = (unsigned int)m_pRingBuffer;
	m_RingBufferEnd = m_RingBufferStart + cRingBufferBytes;
	m_Initialized = 1;
	TJobType* const pJobInstance = (TJobType*)&m_JobInstance;
	pJobInstance->SetParamDataSize(cPacketSize);
}

//polls the DMA job state til it reads scJobFinished, idling IDLE_NOPS nops between two polls
//with CHECK_DEADLOCK more than MAX_IT polls are reported as deadlock: with SUPP_SN all allowed SPUs get stopped and
//	the function never returns, otherwise the wait is given up
template <class TJobType, unsigned int Size>
ILINE void NPPU::CProdConsQueue<TJobType, Size>::WaitFinished() volatile
{
#if defined(CHECK_DEADLOCK)
	volatile int pollRounds = 0;
#endif
	for(;;)
	{
		if(m_DMAJobState == scJobFinished)
			break;//job is done
		//idle a bit before polling again
		for(volatile int nop=0; nop<IDLE_NOPS; ++nop)
			asm volatile("nop");
#if defined(CHECK_DEADLOCK)
		if(pollRounds++ <= MAX_IT)
			continue;
		//polled for too long, report the current queue state
		const unsigned int cPullAddr	= (unsigned int)m_pPull;
		const unsigned int cPushAddr	= (unsigned int)m_pPush;
		const unsigned int cJobState	= m_DMAJobState;
		const char* const cpStateName	= (cJobState == scJobRunning)? "Running" : "Finished";
		printf("Deadlock in CProdConsQueue::WaitFinished, pull=0x%08x   push=0x%08x  state=%s\n", cPullAddr, cPushAddr, cpStateName);
	#if defined(SUPP_SN)
		printf("   Stopping all SPUs\n");
		const uint32 cSPUCount = GetIJobManSPU()->GetSPUsAllowed();
		for(uint32 spu=0; spu<cSPUCount; ++spu)
		{
			NPPU::WriteSPUProbReg(spu, NPPU::scPCSPURunCntl, 0);
			snRawSPUNotifySPUStopped(spu);
		}
		for(;;){}//never returns
	#endif
		break;//give up waiting
#endif
	}
}

//returns true if the push and the pull pointer currently hold the same address
template <class TJobType, unsigned int Size>
ILINE bool NPPU::CProdConsQueue<TJobType, Size>::IsEmpty() volatile
{
	const unsigned int cPushAddr = (unsigned int)m_pPush;
	const unsigned int cPullAddr = (unsigned int)m_pPull;
	return cPushAddr == cPullAddr;
}

//#if defined(DO_SPU_PROFILING)
//returns a reference to the performance stats member of this queue, this is the location which has been handed
//	to the job instance via SetJobPerfStats in the ctor
template <class TJobType, unsigned int Size>
ILINE const volatile NSPU::NDriver::SJobPerfStats& NPPU::CProdConsQueue<TJobType, Size>::GetPerfStats() volatile 
{
	return g_PerfStatsJob;
}
//#endif //DO_SPU_PROFILING

//returns the push pointer advanced by one packet, wrapped to the ring buffer start if it reaches the ring buffer end
//implemented branch free: the sign of (next - end) is smeared into a mask selecting either next or the start
template <class TJobType, unsigned int Size>
ILINE void* const NPPU::CProdConsQueue<TJobType, Size>::GetIncrementedPointer() volatile
{
	const unsigned int cAdvanced = (unsigned int)m_pPush + m_PullIncrement;
	//negative as long as the advanced pointer is still below the end of the ring buffer
	const int cDistToEnd = (int)(cAdvanced - m_RingBufferEnd);
	//all bits set if no wrap is required, 0 otherwise
	const unsigned int cKeepMask = (unsigned int)(cDistToEnd >> 31);
	const unsigned int cWrapMask = ~cKeepMask;
	return (void*)((cAdvanced & cKeepMask) | (m_RingBufferStart & cWrapMask));
}

//adds a parameter packet to the ring buffer and makes sure a job is processing it
//	crPacket - packet to add, GetPacketSize() bytes of GetPacketCont() are copied in 16 byte quantities
//the queue is initialized with the size of the first packet on the first call
//if the job state is finished, the push pointer is advanced and a new job is started via m_JobInstance.Run(),
//	otherwise the push pointer is advanced atomically together with a test that the job state has not changed
template <class TJobType, unsigned int Size>
inline void NPPU::CProdConsQueue<TJobType, Size>::AddPacket(const typename TJobType::packet& crPacket) volatile
{
	const uint32 cPacketSize = crPacket.GetPacketSize();
	if(m_Initialized == 0)
		Init(cPacketSize);//lazy initialization, allocates the ring buffer

	//get incremented push pointer and check if there is a slot to push it into
	void* const cpNextPushPtr = GetIncrementedPointer();

#if defined(CHECK_DEADLOCK)
	volatile int counter = 0;		
#endif
	//spin as long as push equals pull while the job state is not finished
	while(m_pPush == m_pPull && m_DMAJobState != scJobFinished)//wait til a slot becomes available
	{
		//idle twice as many nops as WaitFinished between two polls
		for(volatile int i=0; i<IDLE_NOPS << 1; ++i)
			asm volatile("nop");
#if defined(CHECK_DEADLOCK)
		if(counter++ > MAX_IT)
		{
			printf("Deadlock in CProdConsQueue::AddPacket\n");
	#if defined(SUPP_SN)
			printf("   Stopping all SPUs\n");
			const uint32 cNumSPUAllowed = GetIJobManSPU()->GetSPUsAllowed();
			for(uint32 i=0; i<cNumSPUAllowed; ++i)
			{
				NPPU::WriteSPUProbReg(i, NPPU::scPCSPURunCntl, 0);
				snRawSPUNotifySPUStopped(i);
			}
			while(1){}
	#endif
			//without SUPP_SN the wait is given up here and the slot at m_pPush is written regardless
			break;
		}
#endif
	};
	//copy the packet into the slot the push pointer currently refers to, one 16 byte vector at a time
	const vec_uint4 * __restrict pPacketCont = crPacket.GetPacketCont();
	vec_uint4 * __restrict pPushCont				 = (vec_uint4*)m_pPush;
	const uint32 cIters = cPacketSize >> 4;
	for(uint32 i=0;i<cIters; ++i)//copy packet data
		pPushCont[i] = pPacketCont[i];
	const unsigned int cCurJobState = m_DMAJobState;
	if(cCurJobState == scJobFinished)//job is already finished, since only one SPU is active, no need for synchronizations
	{
		m_pPush = cpNextPushPtr;//make visible to SPU
		m_DMAJobState = scJobRunning;
		m_JobInstance.Run();
	}
	else
	{
		//push pointer needs to be atomically updated together with the check if a SPU job has been finished
		//this way it is ensured that the push pointer is either known to the SPU (which is snooping on that cache line) 
		//	or a new job is to be started
		//cCurJobState is supposed to have the state running if we get here
		//we must be sure that push pointer is only updated if the state is not switching to finished
		//InterlockedCompareExchangeEx returns 0 if m_DMAJobState did not equal cCurJobState, m_pPush is unchanged then
		if(0 == InterlockedCompareExchangeEx((volatile unsigned int*)&m_pPush, (unsigned int)cpNextPushPtr, (volatile unsigned int*)&m_DMAJobState, cCurJobState))
		{
			//job state has switched to finished in the meantime
			assert(m_DMAJobState == scJobFinished);
			//same sequence as in the finished branch above: publish the packet and start a new job
			m_pPush = cpNextPushPtr;//make visible to SPU
			m_DMAJobState = scJobRunning;
			m_JobInstance.Run();
		}
	}
}

//type macros for SPU jobs
//expands to the volatile qualified queue type for job type name with a ring buffer of size packets
//	(all public queue methods are declared volatile, so they can be called on such an instance)
#define PROD_CONS_QUEUE_TYPE(name, size) volatile NPPU::CProdConsQueue<name, (size)>

#endif //__SPU__

#endif //PS3
#endif //__PROD_CONS_QUEUE_H
