#ifndef __SPUMultiThread_h__
#define __SPUMultiThread_h__
#pragma once

//implements all facilities to perform inter PPU/SPU atomic operations commonly used by CryEngine

#if defined(__SPU__) && defined(_SPU_JOB)

//writer flag inside a read/write lock word (bit 16):
//	WriteLock/WriteLockCond store this value into the lock word, ReadLock/ReadLockCond add 1 per reader and test this bit
#define WRITE_LOCK_VAL (1<<16)

#include <SPU/Memory.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <string.h>
#include <errno.h>
#include <stddef.h>
#include <float.h>
#include <CryAssert.h>
#if defined(DO_SPU_PROFILING)
	#ifndef _ALIGN
		#define _ALIGN(num) __attribute__ ((aligned(num)))
	#endif
	#include "JobStructs.h"
	//profiling build: bumps the atomicOps field of the SJobPerfStats record found at G_SPU_CACHE_PROF_PERF_STAT_ADDR
	__attribute__((always_inline))
	inline void IncrementAtomicOpsCounter()
	{
		typedef NSPU::NDriver::SJobPerfStats TJobPerfStats;
		TJobPerfStats *const pStats = (TJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR;
		++pStats->atomicOps;
	}
#else
	//non profiling build: the counter is compiled out, calls are empty
	inline void IncrementAtomicOpsCounter()
	{
	}
#endif
//enable to measure speed without any synchronization overhead (just for testing)
//#define PERF_TEST_NOT_THREAD_SAFE

//atomically adds cVal to the int at pDst and returns the updated value
//	pDst: address of the 4 byte value, the enclosing 128 byte line is worked on through a local buffer
//	cVal: value to add
//	returns the value held in the local buffer slot after the add
//with PERF_TEST_NOT_THREAD_SAFE the add is done without any retry loop on the pointer returned by __spu_cache_lookup
//NOTE(review): mfc_getllar_prep/mfc_getllar_again/mfc_putllc_again are defined outside this file,
//	presumably they prepare and issue the lock line get-and-reserve / conditional put - confirm
__attribute__((always_inline))
inline int CryInterlockedAdd(int volatile *pDst, const int cVal)
{
	IncrementAtomicOpsCounter();
#if defined(PERF_TEST_NOT_THREAD_SAFE)
	int *const pLSDst = __spu_cache_lookup((unsigned int)pDst, 1, 128);
	*pLSDst += cVal;
	return *pLSDst;
#else
	SPUSyncAtomicDCache();
	//128 byte aligned local buffer holding one full line (32 ints)
	volatile int lsBuffer[32] __attribute__ ((aligned(128)));
	//not referenced by the visible code, NOTE(review): possibly consumed by the mfc_* macros - confirm before removing
	int status;
	volatile int *pLSVal;
	//pDst rounded down to the start of its 128 byte line
	const unsigned int cEAAddr = (unsigned int)pDst & ~127;
	mfc_getllar_prep(lsBuffer, cEAAddr);
	do
	{
		mfc_getllar_again();
		//slot inside the local buffer matching the offset of pDst within its line
		pLSVal = &lsBuffer[(((unsigned int)pDst) & 127) >> 2];
		mfc_read_atomic_status();
		//		spu_dsync();
		*pLSVal += cVal;
		mfc_putllc_again();
	}
	//repeat the fetch/add/put sequence until the status read after the put is 0
	while(__builtin_expect(mfc_read_atomic_status() != 0, false));
	return *pLSVal;
#endif//PERF_TEST_NOT_THREAD_SAFE
}

//atomically adds one to *pDst by forwarding to CryInterlockedAdd, returns what CryInterlockedAdd returns
__attribute__((always_inline))
inline int CryInterlockedIncrement(int volatile *pDst)
{
	const int cStep = 1;
	return CryInterlockedAdd(pDst, cStep);
}

//atomically subtracts one from *pDst by forwarding to CryInterlockedAdd, returns what CryInterlockedAdd returns
__attribute__((always_inline))
inline int CryInterlockedDecrement(int volatile *pDst)
{
	const int cStep = -1;
	return CryInterlockedAdd(pDst, cStep);
}

//spins until the int at pLock equals checkVal, then replaces it with setVal
//	pLock: address of the 4 byte lock word, the enclosing 128 byte line is worked on through a local buffer
//	checkVal: value the lock word must hold before it is taken
//	setVal: value written into the lock word once it is taken
//important to note: on the PPU spin locks are currently released by calling CryInterlockedAdd,
//	this should be replaced by ReleaseSpinLock
//uses the lock line reservation lost event (MFC_LLAR_LOST_EVENT) to react quickly and to poll less
//with PERF_TEST_NOT_THREAD_SAFE only the atomic ops counter is incremented
__attribute__((always_inline))
inline void CrySpinLock(volatile int *pLock, int checkVal, int setVal)
{ 
	IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
	SPUSyncAtomicDCache();
	//128 byte aligned local buffer holding one full line (32 ints)
	volatile int lsBuffer[32] __attribute__ ((aligned(128)));
	//not referenced by the visible code, NOTE(review): possibly consumed by the mfc_* macros - confirm before removing
	int status;
	volatile int *pLSVal;
	//pLock rounded down to the start of its 128 byte line
	const unsigned int cEAAddr = (unsigned int)pLock & ~127;
	unsigned int llEvent;
	#if !defined(FAST_UNSAFE_LL_ENABLE)
		spu_write_event_mask(0);//discard previous (or phantom) events, as needed
		//if an event is already pending, read it and acknowledge exactly what was read
		if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
		{
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
	#endif//FAST_UNSAFE_LL_ENABLE
	spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
	mfc_getllar_prep(lsBuffer, cEAAddr);
	do
	{
		mfc_getllar_again();
		//slot inside the local buffer matching the offset of pLock within its line
		pLSVal = &lsBuffer[(((unsigned int)pLock) & 127) >> 2];
		mfc_read_atomic_status();
		//		spu_dsync();
		if(*pLSVal != checkVal)
		{
			//lock word does not hold checkVal: block on the event channel until the event arrives,
			//	i.e. wait for any write to the reserved cache line, then acknowledge it and retry
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
			//NOTE(review): continue inside do/while transfers control to the loop condition below, so
			//	mfc_read_atomic_status() is evaluated without a preceding mfc_putllc_again() - confirm this read cannot stall
			continue;
		}
		*pLSVal = setVal;
		mfc_putllc_again();
	}
	//repeat until the status read after the put is 0
	while(__builtin_expect(mfc_read_atomic_status() != 0, false));
	spu_write_event_mask(0);//disable lock line events
	#if !defined(FAST_UNSAFE_LL_ENABLE)
		//drain and acknowledge an event which may still be pending
		if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
		{
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
	#endif//FAST_UNSAFE_LL_ENABLE
#endif//PERF_TEST_NOT_THREAD_SAFE
}

//releases a spin lock by writing setVal into the 4 byte lock word at pLock with a fenced 4 byte put (mfc_putf, tag 0)
//	pLock: address of the lock word
//	setVal: value to store into the lock word
//the transfer is not waited for here
//with PERF_TEST_NOT_THREAD_SAFE only the atomic ops counter is incremented
__attribute__((always_inline))
inline void CryReleaseSpinLock(volatile int *pLock, int setVal)
{
	IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
	//just copy 4 bytes back from a static location
	//hazard case: ReleaseSpinLock is called before last transfer finished, unlikely since a CrySpinLock 
	//	was called previously which takes way longer
	//the source slot is picked so that its offset inside the 16 byte aligned buffer equals the offset of pLock
	//	inside its own 16 bytes; the slot index (cEA & 15) >> 2 ranges from 0 to 3, hence 4 ints are required
	//	(a 3 element buffer was overrun for lock words located at offset 12)
	static int sSetVal[4] __attribute__ ((aligned(16)));
	const unsigned int cEA = (unsigned int)pLock;
	volatile int *pLS = &sSetVal[(cEA & 15) >> 2];
	*pLS = setVal;
	mfc_putf((volatile void*)pLS, cEA, 4, 0, 0, 0);
#endif//PERF_TEST_NOT_THREAD_SAFE
}

//the usual critical section implementations are not suited for SPU since mutual exclusion is needed between PPU and SPU,
//	this requires a lock as well as the cache line reservation
//make the common implementations fail to compile and provide extended versions instead:
//	each name below expands to an undeclared identifier that tells the user what to do
#define CryCreateCriticalSection DO_NOT_USE_CryCreateCriticalSection_ON_SPU
#define CryCreateCriticalSectionGlobal DO_NOT_USE_CryCreateCriticalSectionGlobal_ON_SPU
//was expanding to the ...Global poison name, which pointed at the wrong function in the compiler diagnostic
#define CryDeleteCriticalSection DO_NOT_USE_CryDeleteCriticalSection_ON_SPU
#define CryDeleteCriticalSectionGlobal DO_NOT_USE_CryDeleteCriticalSectionGlobal_ON_SPU
#define CryTryCriticalSection USE_CryTryCriticalSectionGlobal_FOR_SPU_SHARING
#define CryLeaveCriticalSection USE_CryLeaveCriticalSectionGlobal_FOR_SPU_SHARING
#define CCryMutex USE_CCryMutexGlobal_INSTEAD_OF_CCryMutex_ON_SPU

//global critical sections are mutexes created together with their own volatile cache line
//the PPU version basically first locks a mutex and then spins on the reservation
//enters the critical section: loops until the first int of the line located 128 bytes below cs is 0 and could be set to 1
//	cs: critical section address, asserted to be non null, above 256*1024 and 128 byte aligned
//with PERF_TEST_NOT_THREAD_SAFE only the atomic ops counter is incremented
__attribute__((always_inline))
inline void CryEnterCriticalSectionGlobal(void *cs)
{
	IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
	assert(cs && (unsigned int)cs > 256*1024 && ((unsigned int)cs & 127) == 0);
	const unsigned int cResLineEA = (unsigned int)cs - 128;//address of reservation
	SPUSyncAtomicDCache();
	//128 byte aligned local buffer holding one full line (32 ints), element 0 is the lock flag
	volatile int lsBuffer[32] __attribute__ ((aligned(128)));
	//not referenced by the visible code, NOTE(review): possibly consumed by the mfc_* macros - confirm before removing
	int status;
	mfc_getllar_prep(lsBuffer, cResLineEA);
	do
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//		spu_dsync();
		//flag is non zero: do not write, go for another round
		//NOTE(review): continue inside do/while transfers control to the loop condition below, so
		//	mfc_read_atomic_status() is evaluated without a preceding mfc_putllc_again() - confirm this read cannot stall
		if(lsBuffer[0] != 0)
			continue;
		lsBuffer[0] = 1;
		mfc_putllc_again();
	}
	//repeat until the status read after the put is 0
	while(__builtin_expect(mfc_read_atomic_status() != 0, false));
#endif//PERF_TEST_NOT_THREAD_SAFE
}

//single attempt to enter a global critical section, never loops
//	cs: critical section address, asserted to be non null, above 256*1024 and 128 byte aligned
//	returns false if the first int of the line located 128 bytes below cs is non zero,
//	otherwise sets it to 1 and returns true only if the status read after the put is 0
//with PERF_TEST_NOT_THREAD_SAFE it always returns true
__attribute__((always_inline))
inline bool CryTryCriticalSectionGlobal(void *cs)
{
	IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
	assert(cs && (unsigned int)cs > 256*1024 && ((unsigned int)cs & 127) == 0);
	const unsigned int cResLineEA = (unsigned int)cs - 128;//address of reservation
	SPUSyncAtomicDCache();
	//128 byte aligned local buffer holding one full line (32 ints), element 0 is the lock flag
	volatile int lsBuffer[32] __attribute__ ((aligned(128)));
	//not referenced by the visible code, NOTE(review): possibly consumed by the mfc_* macros - confirm before removing
	int status;
	mfc_getllar_prep(lsBuffer, cResLineEA);
	mfc_getllar_again();
	mfc_read_atomic_status();
	//		spu_dsync();
	//flag already set: give up without writing anything back
	if(lsBuffer[0] != 0)
		return false;
	lsBuffer[0] = 1;
	mfc_putllc_again();
	//a non zero status after the put is reported as failure, no retry is done
	return(mfc_read_atomic_status() == 0);
#else
	return true;
#endif//PERF_TEST_NOT_THREAD_SAFE
}

//leaves a global critical section: unconditionally puts 16 zero bytes (fenced put, tag 0) to the line located 128 bytes below cs
//	cs: critical section address, asserted to be non null, above 256*1024 and 128 byte aligned
//with PERF_TEST_NOT_THREAD_SAFE only the atomic ops counter is incremented
__attribute__((always_inline))
inline void CryLeaveCriticalSectionGlobal(void *cs)
{
	IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
	assert(cs && (unsigned int)cs > 256*1024 && ((unsigned int)cs & 127) == 0);
	//reservation line sits right in front of the critical section
	const unsigned int cLockLineEA = (unsigned int)cs - 128;
	//all zero transfer source kept in static storage
	static int sZeroBlock[4] __attribute__ ((aligned(16))) = {0,0,0,0};
	mfc_putf((volatile void*)sZeroBlock, cLockLineEA, 16, 0, 0, 0);
#endif//PERF_TEST_NOT_THREAD_SAFE
}

//scoped read lock on a read/write lock word:
//	the constructor atomically adds 1 to the lock word and then waits while the WRITE_LOCK_VAL bit is set,
//	the destructor atomically subtracts 1 again
//with PERF_TEST_NOT_THREAD_SAFE both only increment the atomic ops counter and no lock word pointer is stored
struct ReadLock
{
	//rw: lock word, the enclosing 128 byte line is worked on through a local buffer
	__attribute__((always_inline))
	inline ReadLock(volatile int &rw)
	{
		IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		prw = &rw;
		SPUSyncAtomicDCache();
		//128 byte aligned local buffer holding one full line (32 ints)
		volatile int lsBuffer[32] __attribute__ ((aligned(128)));
		//not referenced by the visible code, NOTE(review): possibly consumed by the mfc_* macros - confirm before removing
		int status;
		volatile int *pLSVal;
		//lock word address rounded down to the start of its 128 byte line
		const unsigned int cEAAddr = (unsigned int)prw & ~127;
		//implement: CryInterlockedAdd(prw=&rw,1);
		mfc_getllar_prep(lsBuffer, cEAAddr);
		do
		{
			mfc_getllar_again();
			//slot inside the local buffer matching the offset of the lock word within its line
			pLSVal = &lsBuffer[(((unsigned int)prw) & 127) >> 2];
			mfc_read_atomic_status();
			//		spu_dsync();
			*pLSVal += 1;
			mfc_putllc_again();
		}
		//repeat until the status read after the put is 0
		while(__builtin_expect(mfc_read_atomic_status() != 0, false));
		//implement: volatile char *pw=(volatile char*)&rw+2; for(;*pw;);
		//the reader is registered, now wait as long as the writer bit is seen in the local copy
		if(__builtin_expect(*pLSVal & WRITE_LOCK_VAL, false))
		{
			unsigned int llEvent;
#if !defined(FAST_UNSAFE_LL_ENABLE)
			spu_write_event_mask(0);//discard previous (or phantom) events, as needed
			//if an event is already pending, read it and acknowledge exactly what was read
			if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
			{
				llEvent = spu_readch(MFC_RD_EVENT_STATUS);
				spu_writech(MFC_WR_EVENT_ACK, llEvent);
			}
#endif//FAST_UNSAFE_LL_ENABLE
			spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
			do
			{
				//fetch the line again, nothing is written back inside this loop
				mfc_getllar_again();
				mfc_read_atomic_status();
				//		spu_dsync();
				if(*pLSVal & WRITE_LOCK_VAL)
				{
					//wait for any write to the reserved cache line, snoop on a write to rw
					llEvent = spu_readch(MFC_RD_EVENT_STATUS);
					spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
					continue;
				}
				//writer bit is clear in the fetched copy: done
				break;
			}
			while(1);
			spu_write_event_mask(0);//disable lock line events
#if !defined(FAST_UNSAFE_LL_ENABLE)
			//drain and acknowledge an event which may still be pending
			if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
			{
				llEvent = spu_readch(MFC_RD_EVENT_STATUS);
				spu_writech(MFC_WR_EVENT_ACK, llEvent);
			}
#endif//FAST_UNSAFE_LL_ENABLE
		}
#endif//PERF_TEST_NOT_THREAD_SAFE
	}
	
	//unregisters the reader: atomically adds -1 to the lock word
	//the atomic ops counter is incremented here and once more inside CryInterlockedAdd
	__attribute__((always_inline))
	inline ~ReadLock()
	{
		IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		CryInterlockedAdd(prw,-1);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}
private:
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
	//lock word handed to the constructor
	volatile int *prw;
#endif//PERF_TEST_NOT_THREAD_SAFE
};

//scoped write lock on a read/write lock word:
//	the constructor spins via CrySpinLock until the lock word is 0 and sets it to WRITE_LOCK_VAL,
//	the destructor atomically subtracts WRITE_LOCK_VAL again
//with PERF_TEST_NOT_THREAD_SAFE the constructor does nothing and the destructor only increments the atomic ops counter
struct WriteLock
{
	//rw: lock word to acquire for writing
	__attribute__((always_inline))
	inline WriteLock(volatile int &rw)
	{
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		prw = &rw;
		CrySpinLock(prw, 0, WRITE_LOCK_VAL);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}

	//gives the write lock back by adding the negated writer flag
	__attribute__((always_inline))
	inline ~WriteLock()
	{
		IncrementAtomicOpsCounter();
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		const int cUnlockDelta = -WRITE_LOCK_VAL;
		CryInterlockedAdd(prw, cUnlockDelta);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}
private:
	//lock word handed to the constructor
	volatile int *prw;
};

//conditional scoped read lock on a read/write lock word:
//	with a non zero bActive the constructor atomically adds 1 to the lock word and waits while the WRITE_LOCK_VAL bit is set,
//	with bActive == 0 the lock word is left untouched by the constructor
//	Release() and the destructor both atomically add -bActivated to the lock word
//NOTE(review): Release() does not clear bActivated, so the destructor subtracts again unless SetActive(0) is called - confirm callers do so
//with PERF_TEST_NOT_THREAD_SAFE no lock word access is performed at all
struct ReadLockCond
{
	//rw: lock word, the enclosing 128 byte line is worked on through a local buffer
	//bActive: non zero to actually take the read lock
	__attribute__((always_inline))
	inline ReadLockCond(volatile int &rw, int bActive)
	{
		IncrementAtomicOpsCounter();
		assert((unsigned int)this < 256*1024);
		//always remember the lock word: Release(), the destructor and a later SetActive() use prw
		//	regardless of bActive and must never operate on an uninitialized address
		prw = &rw;
		bActivated = bActive ? 1 : 0;
		//the whole conditional is kept inside one preprocessor region so that both configurations stay brace balanced
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		if(bActive)
		{
			SPUSyncAtomicDCache();
			//128 byte aligned local buffer holding one full line (32 ints)
			volatile int lsBuffer[32] __attribute__ ((aligned(128)));
			//not referenced by the visible code, NOTE(review): possibly consumed by the mfc_* macros - confirm before removing
			int status;
			volatile int *pLSVal;
			//lock word address rounded down to the start of its 128 byte line
			const unsigned int cEAAddr = (unsigned int)prw & ~127;
			//implement: CryInterlockedAdd(prw=&rw,1);
			mfc_getllar_prep(lsBuffer, cEAAddr);
			do
			{
				mfc_getllar_again();
				//slot inside the local buffer matching the offset of the lock word within its line
				pLSVal = &lsBuffer[(((unsigned int)prw) & 127) >> 2];
				mfc_read_atomic_status();
				//		spu_dsync();
				*pLSVal += 1;
				mfc_putllc_again();
			}
			//repeat until the status read after the put is 0
			while(__builtin_expect(mfc_read_atomic_status() != 0, false));
			//implement: volatile char *pw=(volatile char*)&rw+2; for(;*pw;);
			//the reader is registered, now wait as long as the writer bit is seen in the local copy
			if(__builtin_expect(*pLSVal & WRITE_LOCK_VAL, false))
			{
				unsigned int llEvent;
	#if !defined(FAST_UNSAFE_LL_ENABLE)
				spu_write_event_mask(0);//discard previous (or phantom) events, as needed
				if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
				{
					llEvent = spu_readch(MFC_RD_EVENT_STATUS);
					spu_writech(MFC_WR_EVENT_ACK, llEvent);
				}
	#endif//FAST_UNSAFE_LL_ENABLE
				spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
				do
				{
					//fetch the line again, nothing is written back inside this loop
					mfc_getllar_again();
					mfc_read_atomic_status();
					//		spu_dsync();
					if(*pLSVal & WRITE_LOCK_VAL)
					{
						//wait for any write to the reserved cache line, snoop on a write to rw
						llEvent = spu_readch(MFC_RD_EVENT_STATUS);
						spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
						continue;
					}
					//writer bit is clear in the fetched copy: done
					break;
				}
				while(1);
				spu_write_event_mask(0);//disable lock line events
	#if !defined(FAST_UNSAFE_LL_ENABLE)
				//drain and acknowledge an event which may still be pending
				if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
				{
					llEvent = spu_readch(MFC_RD_EVENT_STATUS);
					spu_writech(MFC_WR_EVENT_ACK, llEvent);
				}
	#endif//FAST_UNSAFE_LL_ENABLE
			}
		}
#endif//PERF_TEST_NOT_THREAD_SAFE
	}

	//overrides the amount subtracted by Release() and the destructor (0 turns both into an add of 0)
	__attribute__((always_inline))
	inline void SetActive(int bActive=1) {bActivated = bActive;}

	//atomically adds -bActivated to the lock word, bActivated itself is left unchanged
	__attribute__((always_inline))
	inline void Release() 
	{ 
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		CryInterlockedAdd(prw,-bActivated);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}

	//atomically adds -bActivated to the lock word
	__attribute__((always_inline))
	inline ~ReadLockCond()
	{
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		CryInterlockedAdd(prw, -bActivated);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}

private:
	//lock word handed to the constructor, set for active and inactive locks alike
	volatile int *prw;
	//amount subtracted from the lock word on Release()/destruction: 1 if the read lock is held, else 0
	int bActivated;
};

//conditional scoped write lock on a read/write lock word:
//	with a non zero bActive the constructor spins via CrySpinLock until the lock word is 0 and sets it to WRITE_LOCK_VAL,
//	with bActive == 0 the lock word is left untouched by the constructor
//	Release() and the destructor both atomically add -iActive to the lock word, iActive is not cleared by Release()
//with PERF_TEST_NOT_THREAD_SAFE no lock word access is performed and the members are not initialized by the constructor
struct WriteLockCond
{
	//rw: lock word, bActive: non zero to actually take the write lock
	__attribute__((always_inline))
	inline WriteLockCond(volatile int &rw,int bActive=1)
	{
		assert((unsigned int)this < 256*1024);
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		prw = &rw;
		iActive = bActive ? WRITE_LOCK_VAL : 0;
		if(iActive != 0)
		{
			CrySpinLock(prw, 0, iActive);
		}
#endif//PERF_TEST_NOT_THREAD_SAFE
	}

	//maps bActive to the amount given back later: WRITE_LOCK_VAL for 1, 0 for 0
	__attribute__((always_inline))
	inline void SetActive(int bActive=1)
	{
		iActive = -bActive & WRITE_LOCK_VAL;
	}

	//atomically adds -iActive to the lock word
	__attribute__((always_inline))
	inline void Release()
	{
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		const int cUnlockDelta = -iActive;
		CryInterlockedAdd(prw, cUnlockDelta);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}

	//performs the same atomic add as Release()
	__attribute__((always_inline))
	inline ~WriteLockCond()
	{
#if !defined(PERF_TEST_NOT_THREAD_SAFE)
		const int cUnlockDelta = -iActive;
		CryInterlockedAdd(prw, cUnlockDelta);
#endif//PERF_TEST_NOT_THREAD_SAFE
	}
private:
	//lock word handed to the constructor
	volatile int *prw;
	//amount subtracted from the lock word on Release()/destruction: WRITE_LOCK_VAL if the write lock is held, else 0
	int iActive;
};


#endif //__SPU__ && _SPU_JOB
#endif // __SPUMultiThread_h__
