#ifndef __SPUMultiThread_h__
#define __SPUMultiThread_h__
#pragma once

//implements all facilities to perform inter PPU/SPU atomic operations commonly used by CryEngine

#if defined(__SPU__) && defined(_SPU_JOB)

#define WRITE_LOCK_VAL (1<<16)

#include <SPU/Memory.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <string.h>
#include <errno.h>
#include <stddef.h>
#include <float.h>
#include <CryAssert.h>

#ifndef _ALIGN
	#define _ALIGN(num) __attribute__ ((aligned(num)))
#endif

#undef DEBUG_BREAK
#define DEBUG_BREAK __asm volatile ("stop 255");

#if defined(DO_SPU_PROFILING)
	#include "JobStructs.h"
	__attribute__((always_inline))
	inline void IncrementAtomicOpsCounter()
	{
		NSPU::NDriver::SJobPerfStats *g_pPerfStats = (NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR;
		++g_pPerfStats->atomicOps;
	}
#else	
	#define IncrementAtomicOpsCounter()
#endif

//enable it to see which atomic ops are called
//#define LOG_OPS 
//#define DISABLE_EVENTS

__attribute__((always_inline))
inline void* __spu_get_atomic_buffer()
{
	//returns the buffer at G_SPU_ATOMIC_BUF which every atomic operation in this file uses as
	//	its staging area (NOTE(review): presumably one 128 byte cache line in local store - confirm)
	void *const pAtomicBuffer = (void*)G_SPU_ATOMIC_BUF;
	return pAtomicBuffer;
}

__attribute__((always_inline))
inline int CryInterlockedAdd(int volatile *pDst, const int cVal, const bool cIsAligned = true)
{
	//implements atomically: *pDst += cVal; return *pDst; (the value after the addition)
	//cIsAligned == true: pDst is used as transfer address as is and the value is taken from the start
	//	of the atomic buffer, otherwise the enclosing 128 byte line is transferred and the value is
	//	addressed by its offset within that line
#if defined(LOG_OPS)
	printf("atomic: entering CryInterlockedAdd\n");
#endif
	IncrementAtomicOpsCounter();
	SPUSyncAtomicDCache();
	const unsigned int cDstEA = (unsigned int)pDst;
	volatile int *pLineLS = (int*)__spu_get_atomic_buffer();
	volatile int *pValLS = pLineLS;
	unsigned int lineEA = cDstEA;
	if(!cIsAligned)
	{
		pValLS = (volatile int*)&pLineLS[(cDstEA & 127) >> 2];
		lineEA = cDstEA & ~127;
	}
	mfc_prep(pLineLS, lineEA);
	for(;;)
	{
		//fetch the line, modify the local copy and try to write it back conditionally
		mfc_getllar_again();
		mfc_read_atomic_status();
		*pValLS += cVal;
		mfc_putllc_again();
		//status 0: the conditional put went through, anything else: start over
		if(__builtin_expect(mfc_read_atomic_status() == 0, true))
			break;
	}
#if defined(LOG_OPS)
	printf("atomic: exiting CryInterlockedAdd\n");
#endif
	return *pValLS;
}
#define CryInterlockedAdd$VM_ CryInterlockedAdd

__attribute__((always_inline))
inline unsigned int CryInterlockedCompareExchange(volatile unsigned int *pAddr, unsigned int setVal, unsigned int checkVal, const bool cIsAligned = true)
{
#if defined(LOG_OPS)
	printf("atomic: entering CryInterlockedCompareExchange\n");
#endif
	//implements atomically: 	unsigned int res = *pAddr; if (checkVal == res)*pAddr = setVal;	return res;
	//pAddr:      address of the value to compare and exchange
	//setVal:     value stored if the comparison succeeds
	//checkVal:   value *pAddr is compared against
	//cIsAligned: true: pAddr is used as transfer address as is and the value is taken from the start
	//	of the atomic buffer, false: the enclosing 128 byte line is transferred and the value is
	//	addressed by its offset within that line
	//returns the value found at pAddr (equal to checkVal if the exchange took place)
	unsigned int res;
	IncrementAtomicOpsCounter();
	SPUSyncAtomicDCache();
	volatile unsigned int *pLSBuffer = (unsigned int*)__spu_get_atomic_buffer();
	volatile unsigned int *pLSVal = cIsAligned?pLSBuffer : (volatile unsigned int*)&pLSBuffer[(((unsigned int)pAddr) & 127) >> 2];
	const unsigned int cEAAddr = cIsAligned?(unsigned int)pAddr : ((unsigned int)pAddr & ~127);
	mfc_prep(pLSBuffer, cEAAddr);
	int status;
	do
	{
		//status must be reset for every attempt: if a previous conditional put failed and the value
		//	no longer matches checkVal, no put is issued and the loop has to terminate with the
		//	freshly read value instead of spinning on the stale failure status
		status = 0;
		mfc_getllar_again();
		mfc_read_atomic_status();
		//		spu_dsync();
		res = *pLSVal;
		if(res == checkVal)
		{
			*pLSVal = setVal;
			mfc_putllc_again();
			//non zero status: conditional put failed, retry
			status = mfc_read_atomic_status();
		}
	}
	while(__builtin_expect(status != 0, false));
#if defined(LOG_OPS)
	printf("atomic: exiting CryInterlockedCompareExchange\n");
#endif
	return res;
}
#define CryInterlockedCompareExchange$VM_ CryInterlockedCompareExchange

__attribute__ ((always_inline))
inline void *CryInterlockedCompareExchangePointer(void *volatile *dst, void *exchange, void *comperand, const bool cIsAligned)
{
	//pointer flavour of CryInterlockedCompareExchange: pointers are passed through as unsigned values
	//returns the pointer found at dst (equal to comperand if the exchange took place)
	const unsigned int cFound = CryInterlockedCompareExchange((unsigned volatile*)dst, (unsigned)exchange, (unsigned)comperand, cIsAligned);
	return (void*)cFound;
}
#define CryInterlockedCompareExchangePointer$VMMM_ CryInterlockedCompareExchangePointer

__attribute__((always_inline))
inline int CryInterlockedExchange(volatile unsigned int *pAddr, unsigned int setVal, const bool cIsAligned = true, const bool cSyncAtomicDCache = true)
{
	//implements atomically: 	unsigned int res = *pAddr; *pAddr = setVal;	return res;
	//cIsAligned == true: pAddr is used as transfer address as is and the value is taken from the start
	//	of the atomic buffer, otherwise the enclosing 128 byte line is transferred
	//cSyncAtomicDCache == false skips the call to SPUSyncAtomicDCache
#if defined(LOG_OPS)
	printf("atomic: entering CryInterlockedExchange\n");
#endif
	IncrementAtomicOpsCounter();
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	const unsigned int cAddrEA = (unsigned int)pAddr;
	volatile int *pLineLS = (int*)__spu_get_atomic_buffer();
	volatile int *pValLS = pLineLS;
	unsigned int lineEA = cAddrEA;
	if(!cIsAligned)
	{
		pValLS = (volatile int*)&pLineLS[(cAddrEA & 127) >> 2];
		lineEA = cAddrEA & ~127;
	}
	mfc_prep(pLineLS, lineEA);
	int prevVal;
	for(;;)
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//remember the old value, overwrite it locally and try to commit the line
		prevVal = *pValLS;
		*pValLS = setVal;
		mfc_putllc_again();
		if(__builtin_expect(mfc_read_atomic_status() == 0, true))
			break;
	}
#if defined(LOG_OPS)
	printf("atomic: exiting CryInterlockedExchange\n");
#endif
	return prevVal;
}
#define CryInterlockedExchange$VM_ CryInterlockedExchange

__attribute__((always_inline))
inline unsigned int __spu_recursive_lock(unsigned int *pAddr, unsigned int checkVal, unsigned int val0, unsigned int val1, unsigned int val2, const bool cSyncAtomicDCache = true)
{
#if defined(LOG_OPS)
	printf("atomic: entering __spu_recursive_lock\n");
#endif
	//implements atomically: 	unsigned int res = *pAddr; if (checkVal == res){*pAddr = val0;pAddr[1] = val1;pAddr[2] = val2;}	return res;
	//assumes alignment of pAddr
	unsigned int res;
	IncrementAtomicOpsCounter();
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	volatile unsigned int *pLSBuffer = (unsigned int*)__spu_get_atomic_buffer();
	volatile unsigned int *__restrict pLSVal = pLSBuffer;
	const unsigned int cEAAddr = (unsigned int)pAddr;
	unsigned int llEvent;
	mfc_prep(pLSBuffer, cEAAddr);
	int status = 0;
	do
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//		spu_dsync();
		res = *pLSVal;
		if(res == checkVal)
		{
			*pLSVal		= val0;
			pLSVal[1] = val1;
			pLSVal[2] = val2;
			mfc_putllc_again();
			status = mfc_read_atomic_status();
		}
	}
	while(__builtin_expect(status != 0, false));
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_recursive_lock\n");
#endif
	return res;
}
#define __spu_recursive_lock$VM_ __spu_recursive_lock

__attribute__((always_inline))
inline unsigned int __spu_recursive_spinlock(unsigned int *pAddr, unsigned int checkVal, unsigned int val0, unsigned int val1, unsigned int val2, const bool cSyncAtomicDCache = true)
{
#if defined(LOG_OPS)
	printf("atomic: entering __spu_recursive_spinlock\n");
#endif
	//spinning variant of __spu_recursive_lock: waits until *pAddr == checkVal, then
	//implements atomically: 	unsigned int res = *pAddr; *pAddr = val0;pAddr[1] = val1;pAddr[2] = val2;	return res;
	//	(res therefore always equals checkVal)
	//assumes alignment of pAddr
	//cSyncAtomicDCache == false skips the call to SPUSyncAtomicDCache
	unsigned int res;
	IncrementAtomicOpsCounter();
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	volatile unsigned int *pLSBuffer = (unsigned int*)__spu_get_atomic_buffer();
	volatile unsigned int *__restrict pLSVal = pLSBuffer;
	const unsigned int cEAAddr = (unsigned int)pAddr;
	unsigned int llEvent;
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//discard previous (or phantom) events, as needed
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		//an event is pending: read and acknowledge it so it is not mistaken for ours
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
#endif
	int status = 0;
	mfc_prep(pLSBuffer, cEAAddr);
	do
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//		spu_dsync();
		if((unsigned int)*pLSVal != checkVal)
		{
			//wait for any write to the reserved cache line
			//snoop on a write to push
#if !defined(DISABLE_EVENTS)
			//reading the event status channel stalls until the enabled event is raised
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
#endif
			//continue jumps to the loop condition, status = 1 forces another iteration
			status = 1;
			continue;
		}
		//lock is free: update the local copy of the line and try to commit it
		res = *pLSVal;
		*pLSVal		= val0;
		pLSVal[1] = val1;
		pLSVal[2] = val2;
		mfc_putllc_again();
		//non zero status: conditional put failed, retry
		status = mfc_read_atomic_status();
	}
	while(__builtin_expect(status != 0, false));
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//disable lock line events
#if !defined(FAST_UNSAFE_LL_ENABLE)
	//discard an event which might have been raised after the lock was taken
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#endif
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_recursive_spinlock\n");
#endif
	return res;
}
#define __spu_recursive_spinlock$VM_ __spu_recursive_spinlock

__attribute__((always_inline))
inline int __spu_recursive_unlock(int *pAddr, int val0, int val1, int val2, const bool cSyncAtomicDCache = true)
{
	//implements atomically: 	int res = *pAddr; *pAddr = val0; pAddr[1] = val1;	pAddr[2] = val2;	return res;
	//assumes alignment of pAddr
	//cSyncAtomicDCache == false skips the call to SPUSyncAtomicDCache
#if defined(LOG_OPS)
	printf("atomic: entering __spu_recursive_unlock\n");
#endif
	IncrementAtomicOpsCounter();
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	volatile int *pLineLS = (volatile int*)__spu_get_atomic_buffer();
	volatile int *__restrict pLockLS = pLineLS;
	const unsigned int cLockEA = (unsigned int)pAddr;
	mfc_prep(pLineLS, cLockEA);
	int prevVal;
	for(;;)
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//remember the first word, then overwrite all three words unconditionally
		prevVal = *pLockLS;
		pLockLS[0] = val0;
		pLockLS[1] = val1;
		pLockLS[2] = val2;
		mfc_putllc_again();
		//status 0: the conditional put went through, anything else: start over
		if(__builtin_expect(mfc_read_atomic_status() == 0, true))
			break;
	}
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_recursive_unlock\n");
#endif
	return prevVal;
}
#define __spu_recursive_unlock$VM_ __spu_recursive_unlock

__attribute__((always_inline))
inline int CryInterlockedIncrement(int volatile *pDst, const bool cIsAligned = false)
{
	//atomic increment, expressed as CryInterlockedAdd with +1, returns what CryInterlockedAdd returns
	//note: cIsAligned defaults to false here
	const int cStep = 1;
	return CryInterlockedAdd(pDst, cStep, cIsAligned);
}
#define CryInterlockedIncrement$VM_ CryInterlockedIncrement

__attribute__((always_inline))
inline int CryInterlockedDecrement(int volatile *pDst, const bool cIsAligned = false)
{
	//atomic decrement, expressed as CryInterlockedAdd with -1, returns what CryInterlockedAdd returns
	//note: cIsAligned defaults to false here
	const int cStep = -1;
	return CryInterlockedAdd(pDst, cStep, cIsAligned);
}
#define CryInterlockedDecrement$VM_ CryInterlockedDecrement

//important to note: on the PPU, spin locks are currently released by calling CryInterlockedAdd,
//	this should be replaced by ReleaseSpinLock
//uses the lock line reservation lost event to react quickly and to poll less
__attribute__((always_inline))
inline void CrySpinLock(volatile int *pLock, int checkVal, int setVal, const bool cIsAligned = false, const bool cSyncAtomicDCache = true)
{ 
#if defined(LOG_OPS)
	printf("atomic: entering CrySpinLock\n");
#endif
	//spins until *pLock == checkVal and then atomically sets *pLock = setVal
	//pLock:             address of the lock value
	//checkVal:          value which marks the lock as free
	//setVal:            value written to take the lock
	//cIsAligned:        true: pLock is used as transfer address as is and the value is taken from the
	//	start of the atomic buffer, false: the enclosing 128 byte line is transferred and the value is
	//	addressed by its offset within that line
	//cSyncAtomicDCache: false skips the call to SPUSyncAtomicDCache
	IncrementAtomicOpsCounter();
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	volatile int * pLSBuffer = (int*)__spu_get_atomic_buffer();
	volatile int *pLSVal = cIsAligned?pLSBuffer : (volatile int*)&pLSBuffer[(((unsigned int)pLock) & 127) >> 2];
	const unsigned int cEAAddr = cIsAligned?(unsigned int)pLock : ((unsigned int)pLock & ~127);
	unsigned int llEvent;
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//discard previous (or phantom) events, as needed
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
#endif
	int status = 0;
	mfc_prep(pLSBuffer, cEAAddr);
	do
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//		spu_dsync();
		if(*pLSVal != checkVal)
		{
			//wait for any write to the reserved cache line
			//snoop on a write to push
#if !defined(DISABLE_EVENTS)
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
#endif
			//continue jumps to the loop condition, status = 1 forces another iteration
			status = 1;
			continue;
		}
		*pLSVal = setVal;
		mfc_putllc_again();
		//non zero status: conditional put failed, retry
		status = mfc_read_atomic_status();
	}
	//the condition used to be the comma expression (status != 0, false) which is always false:
	//	the loop ran exactly once and returned without holding the lock whenever the lock was taken
	//	or the conditional put failed
	while(__builtin_expect(status != 0, false));

#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//disable lock line events
#endif
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if defined(LOG_OPS)
	printf("atomic: exiting CrySpinLock\n");
#endif
}
#define CrySpinLock$VM_ CrySpinLock

__attribute__((always_inline))
inline void CryReleaseSpinLock(volatile int *pLock, int setVal)
{
#if defined(LOG_OPS)
	printf("atomic: entering CryReleaseSpinLock\n");
#endif
	//releases a spin lock by writing setVal to *pLock with a plain 4 byte transfer
	//pLock:  address of the lock value
	//setVal: value which marks the lock as free
	IncrementAtomicOpsCounter();
	//just copy 4 bytes back from a 16 byte aligned local buffer
	//the source slot is chosen so that it has the same offset within 16 bytes as the destination,
	//	(cEA & 15) >> 2 ranges from 0 to 3, so the buffer needs 4 elements (it used to have 3 and the
	//	last slot was out of bounds)
	int sSetVal[4] __attribute__ ((aligned(16)));
	const unsigned int cEA = (unsigned int)pLock;
	volatile int *pLS = &sSetVal[(cEA & 15) >> 2];
	*pLS = setVal;
	mfc_putf((volatile void*)pLS, cEA, 4, 0, 0, 0);
	//NOTE(review): presumably waits for the transfer issued above to complete, which is required
	//	before the local buffer goes out of scope - confirm against SyncMemory
	SyncMemory(0);
#if defined(LOG_OPS)
	printf("atomic: exiting CryReleaseSpinLock\n");
#endif
}

__attribute__((always_inline))
inline void *__spu_load_atomic_buffer(void *pAddr)
{
	//loads the 128 byte line containing pAddr into the atomic buffer
	//returns the address inside the atomic buffer which corresponds to pAddr
#if defined(LOG_OPS)
	printf("atomic: entering __spu_load_atomic_buffer\n");
#endif
	IncrementAtomicOpsCounter();
	SPUSyncAtomicDCache();
	const unsigned int cAddr = (unsigned int)pAddr;
	const unsigned int cLineEA = cAddr & ~127;
	const unsigned int cLineOffset = cAddr & 127;
	volatile unsigned char* pLineLS = (unsigned char*)__spu_get_atomic_buffer();
	mfc_prep(pLineLS, cLineEA);
	mfc_getllar_again();
	mfc_read_atomic_status();
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_load_atomic_buffer\n");
#endif
	return (void*)&pLineLS[cLineOffset];
}
#define __spu_load_atomic_buffer$VM_ __spu_load_atomic_buffer

__attribute__((always_inline))
inline void __spu_load_atomic_cacheline(const void *cpSrcAddrPPU, void* pLS, const bool cSyncAtomicCache)
{
	//loads the line at cpSrcAddrPPU into the caller supplied buffer pLS (not into the atomic buffer)
	//cpSrcAddrPPU is passed on unmodified, no alignment is applied here
	//cSyncAtomicCache == false skips the call to SPUSyncAtomicDCache
	if(cSyncAtomicCache)
	{
		SPUSyncAtomicDCache();
	}
	const unsigned int cSrcEA = (unsigned int)cpSrcAddrPPU;
	mfc_prep(pLS, cSrcEA);
	mfc_getllar_again();
	mfc_read_atomic_status();
}
#define __spu_load_atomic_cacheline$VML_ __spu_load_atomic_cacheline

//tries to put cache line back to main memory, returns 0 if successful
__attribute__((always_inline))
inline int __spu_try_put_atomic_cacheline()
{
	//single attempt (no retry) to write the previously loaded line back
	//returns the atomic status, 0 means the put was successful
	mfc_putllc_again();
	const int cPutStatus = mfc_read_atomic_status();
	return cPutStatus;
}

__attribute__((always_inline))
inline int __spu_wait_unequal(int *pAddr, int checkVal, const bool cIsAligned = false, const bool cSyncAtomicDCache = true)
{
#if defined(LOG_OPS)
	printf("atomic: entering __spu_wait_unequal(pAddr=0x%08x, checkVal=%d)\n",(unsigned int)pAddr, checkVal);
#endif
	//waits til *pAddr != checkVal in atomic buffer, returns *pAddr
	//pAddr:             address of the value to watch
	//checkVal:          value to wait to change
	//cIsAligned:        true: pAddr is used as transfer address as is and the value is taken from the
	//	start of the atomic buffer, false: the enclosing 128 byte line is transferred and the value is
	//	addressed by its offset within that line
	//cSyncAtomicDCache: false skips the call to SPUSyncAtomicDCache
	IncrementAtomicOpsCounter();
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	volatile int * pLSBuffer = (volatile int*)__spu_get_atomic_buffer();
	volatile int *pLSVal = cIsAligned?pLSBuffer : (volatile int*)&pLSBuffer[(((unsigned int)pAddr) & 127) >> 2];
	int retVal;
	const unsigned int cEAAddr = cIsAligned?(unsigned int)pAddr : ((unsigned int)pAddr & ~127);
#if !defined(DISABLE_EVENTS)
	//declared whenever events are in use: it is needed inside the wait loop even if
	//	FAST_UNSAFE_LL_ENABLE is defined (it used to be declared only without FAST_UNSAFE_LL_ENABLE
	//	which did not compile in that configuration)
	unsigned int llEvent;
#endif
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//discard previous (or phantom) events, as needed
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
#endif
	mfc_prep(pLSBuffer, cEAAddr);
	do
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		retVal = *pLSVal;
		if(retVal == checkVal)
		{
			//wait for any write to the reserved cache line
			//snoop on a write to push
#if !defined(DISABLE_EVENTS)
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
#endif
			//reload the line and check again
			continue;
		}
		break;
	}
	while(1);
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//disable lock line events
#endif
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_wait_unequal\n");
#endif
	return retVal;
}
#define __spu_wait_unequal$VM_ __spu_wait_unequal

__attribute__((always_inline))
inline void __spu_wait_equal(int *pAddr, int checkVal, const bool cIsAligned = false)
{
#if defined(LOG_OPS)
	printf("atomic: entering __spu_wait_equal\n");
#endif
	//waits til *pAddr == checkVal in atomic buffer, nothing is returned and nothing is written
	//cIsAligned == true: pAddr is used as transfer address as is and the value is taken from the start
	//	of the atomic buffer, otherwise the enclosing 128 byte line is transferred and the value is
	//	addressed by its offset within that line
	IncrementAtomicOpsCounter();
	SPUSyncAtomicDCache();
	volatile int *pLSBuffer = (int*)__spu_get_atomic_buffer();
	volatile int *pLSVal = cIsAligned?pLSBuffer : (volatile int*)&pLSBuffer[(((unsigned int)pAddr) & 127) >> 2];
	const unsigned int cEAAddr = cIsAligned?(unsigned int)pAddr : ((unsigned int)pAddr & ~127);
	unsigned int llEvent;
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//discard previous (or phantom) events, as needed
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		//an event is pending: read and acknowledge it so it is not mistaken for ours
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
#endif
	mfc_prep(pLSBuffer, cEAAddr);
	do
	{
		mfc_getllar_again();
		mfc_read_atomic_status();
		//		spu_dsync();
		if(*pLSVal != checkVal)
		{
			//wait for any write to the reserved cache line
			//snoop on a write to push
#if !defined(DISABLE_EVENTS)
			//reading the event status channel stalls until the enabled event is raised
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
#endif
			//reload the line and check again
			continue;
		}
		break;
	}
	while(1);
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//disable lock line events
#endif
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	//discard an event which might have been raised after the last load
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_wait_equal\n");
#endif
}
#define __spu_wait_equal$VM_ __spu_wait_equal

//return 0 if notified, 1 otherwise
__attribute__((always_inline))
inline unsigned int __spu_timed_wait_unequal(const unsigned int cMicros, int *pAddr, int checkVal, const bool cIsAligned = false, const bool cSyncAtomicDCache = true)
{
	//waits til either time out occurs or *pAddr becomes != checkVal
	//cMicros:           time out in micro seconds, 0 returns 1 immediately
	//pAddr:             address of the value to watch
	//checkVal:          value to wait to change
	//cIsAligned:        true: pAddr is used as transfer address as is and the value is taken from the
	//	start of the atomic buffer, false: the enclosing 128 byte line is transferred
	//cSyncAtomicDCache: false skips the call to SPUSyncAtomicDCache
	//returns 0 if a change of *pAddr was observed while waiting, 1 otherwise (this includes the case
	//	that *pAddr is already != checkVal on the first load)
	//80 decrementer changes per micro second
	if(__builtin_expect(cMicros == 0, false))
		return 1;
#if defined(LOG_OPS)
	printf("atomic: entering __spu_timed_wait_unequal\n");
#endif
	const unsigned int cDecrStart	= spu_readch(SPU_RdDec);
	const unsigned int cDecrStop	= cDecrStart - cMicros * 80;
	if(cSyncAtomicDCache)
		SPUSyncAtomicDCache();
	volatile int *pLSBuffer = (int*)__spu_get_atomic_buffer();
	const int cEAAddr = cIsAligned?(int)pAddr : ((int)pAddr & ~127);
	volatile int *pLSVal = cIsAligned?pLSBuffer : (volatile int*)&pLSBuffer[(((unsigned int)pAddr) & 127) >> 2];
	unsigned int llEvent;
#if !defined(FAST_UNSAFE_LL_ENABLE) && !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//discard previous (or phantom) events, as needed
	if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	{
		llEvent = spu_readch(MFC_RD_EVENT_STATUS);
		spu_writech(MFC_WR_EVENT_ACK, llEvent);
	}
#endif//FAST_UNSAFE_LL_ENABLE
#if !defined(DISABLE_EVENTS)
	//enabled whenever events are in use, as in the other wait functions (it used to be skipped if
	//	FAST_UNSAFE_LL_ENABLE was defined, leaving the loop below without any event to react on)
	spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
#endif
	unsigned int timedOut = 1;
	mfc_prep(pLSBuffer, cEAAddr);
	mfc_getllar_again();
	mfc_read_atomic_status();
	if(__builtin_expect(*pLSVal == checkVal, true))
	{
		int cond;
		do 
		{
			const unsigned int cDecrVal = spu_readch(SPU_RdDec);
			//check event channel count, if count != 0, Reload atomic buffer
			//	(with DISABLE_EVENTS the buffer is reloaded on every iteration)
	#if !defined(DISABLE_EVENTS)
			if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
	#endif
			{
	#if !defined(DISABLE_EVENTS)
				llEvent = spu_readch(MFC_RD_EVENT_STATUS);
				spu_writech(MFC_WR_EVENT_ACK, llEvent);
	#endif
				mfc_getllar_again();
				mfc_read_atomic_status();
				if(__builtin_expect(*pLSVal != checkVal, false))
				{
					//leave through the common exit so the event mask gets reset on this path too
					//	(it used to return here with MFC_LLAR_LOST_EVENT still enabled)
					timedOut = 0;
					break;
				}
			}
			//keep waiting while the decrementer is still between start and stop value
			cond = cDecrVal > cDecrStop;
			cond &= cDecrVal < cDecrStart;
		} while(cond);
	}
#if !defined(DISABLE_EVENTS)
	spu_write_event_mask(0);//disable lock line events
	#if !defined(FAST_UNSAFE_LL_ENABLE)
		if(__builtin_expect(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false))
		{
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
	#endif//FAST_UNSAFE_LL_ENABLE
#endif
#if defined(LOG_OPS)
	if(timedOut)
		printf("atomic: exiting __spu_timed_wait_unequal\n");
	else
		printf("atomic: exiting __spu_timed_wait_unequal (0)\n");
#endif
	return timedOut;
}
#define __spu_timed_wait_unequal$VM_ __spu_timed_wait_unequal

__attribute__((always_inline))
inline void __spu_microsleep(const unsigned int cMicros)
{
#if defined(LOG_OPS)
	printf("atomic: entering __spu_microsleep\n");
#endif
	//busy waits for cMicros micro seconds by polling the decrementer, cMicros == 0 returns immediately
	//80 decrementer changes per micro second
	const unsigned int cTicksToWait	= cMicros * 80;
	const unsigned int cDecrStart		= spu_readch(SPU_RdDec);
	//the decrementer counts down, so start - current is the number of elapsed ticks
	//unsigned arithmetic keeps this correct if the decrementer wraps around while waiting
	//the former range check (stop < current < start) ended the sleep immediately if the decrementer
	//	had not changed since the start value was read or if start - ticks wrapped around
	while((unsigned int)(cDecrStart - spu_readch(SPU_RdDec)) < cTicksToWait)
	{
	}
#if defined(LOG_OPS)
	printf("atomic: exiting __spu_microsleep\n");
#endif
}


#endif //__SPU__ && _SPU_JOB
#endif // __SPUMultiThread_h__
