/* 
	definitions for SPU
	part of precompiled headers, include important headers
*/

#ifndef __SPU_H
#define __SPU_H
#pragma once

#if defined(PS3) && !defined __CRYCG__

//prefix placed in front of SPU assertion and cache hazard messages
#undef SPU_ASSERT_STRING
#define SPU_ASSERT_STRING "spu assert: "	//this string is checked in interrupt printf routine handler on PPU

//printf flavour selection: unless SUPP_PRINTF is defined, SUPP_OLD_PRINTF is used
//#define SUPP_PRINTF
#if !defined(SUPP_PRINTF)
	#define SUPP_OLD_PRINTF
#endif

//keep in sync in DoCacheLookup_spu.S
#define STACK_WARNING_VAL 512

//attribute shortcut: places the annotated object into the section named "data"
#define _DATASEC __attribute__((section("data")))

//enable printf (if not already) for hazard checks and assertion of all kind
//NOTE(review): after the selection above either SUPP_PRINTF or SUPP_OLD_PRINTF is always defined,
//so the condition below can currently never be true
#if (defined(_DEBUG) || !defined(_NO_SPU_CACHE_HAZ_CHECK) || !defined(_NO_SPU_CACHE_ASSERT) || !defined(_NO_SPU_ASSERT) || defined(CHECK_BUB_HAZARD)) && !defined(SUPP_PRINTF) && !defined(SUPP_OLD_PRINTF)
	//need printf statements for bubble hazards
//	#define SUPP_PRINTF
		#define SUPP_OLD_PRINTF
#endif

#if defined(__SPU__)

	//stops SPU execution for debugging, use as a statement: SPU_DEBUG_HALT;
	//both variants expand to exactly one statement, so an unbraced
	//"if (x) SPU_DEBUG_HALT;" keeps the whole halt sequence under the condition
	#if defined(SUPP_SN)
		#define SPU_DEBUG_HALT __asm volatile ("stop 255")
	#else
		//halt builtin followed by an endless loop, wrapped into a single do/while statement
		#define SPU_DEBUG_HALT do { __builtin_spu_hcmpeq_0(0, 0); while(1); } while (false)
	#endif
	#include <spu_intrinsics.h>

	//replace any previous assert/spu_assert definition, assert(cond) is routed to spu_assert(cond)
	#undef assert
	#undef spu_assert
	#if defined(_NO_SPU_ASSERT)
		//assertions disabled: expands to an empty statement, cond is not evaluated
		#define spu_assert(cond) do{}while (false)
	#else 
		//if cond is false: prints SPU_ASSERT_STRING, the condition text, function, file and line via printf,
		//then executes SPU_DEBUG_HALT; the failing case is marked as unlikely with __builtin_expect
		#define spu_assert(cond) \
			do \
			{ \
				if (__builtin_expect(!(cond), 0)) \
				{ \
					printf("%s(%s)  in %s (%s : %d)\n",SPU_ASSERT_STRING, #cond, __func__, __FILE__, __LINE__); \
					SPU_DEBUG_HALT; \
				} \
			} while (false)
	#endif//_NO_SPU_ASSERT
	#define assert(cond) spu_assert(cond)

	//replace any previous cellDmaAssert definition:
	//with NO_CELL_DMA_ASSERT it expands to nothing (cond is not evaluated), otherwise it forwards to spu_assert
	#undef cellDmaAssert
	#ifdef NO_CELL_DMA_ASSERT
		#define cellDmaAssert(cond)
	#else 
		#define cellDmaAssert(cond) spu_assert(cond)
	#endif //NO_CELL_DMA_ASSERT
#endif//__SPU__

#if defined(__SPU__)
	#include "Cache/CacheDefs_spu.h"
	//register definitions, used to save loads at vital places
	//each variable below is bound to a fixed SPU register ($70..$78) through the GNU
	//"register ... __asm__" extension, so its value lives in that register rather than in memory

	//mask for EA->set generation (number of sets -1) << 4
	register vec_uint4 g_SetMaskSL4 __asm__ ("$78");

	//pointer to cache memory
	register vec_uint4* __restrict g_pSPUCache __asm__ ("$77");

	//pointer to cache directory
	register vec_uint4* __restrict g_pSPUCacheDir __asm__ ("$76");

	//cache LRU counter, each access gets the incremented most recent value set
	//therefore is the slot with the lowest value to be replaced (only first slot is used)
	//static register assignment to make fast increments for __spu_cache_touch
	register vec_uint4 g_LRUCounter __asm__ ("$75");//mapped the same on driver side

	//cache constants used by asm version of DoLookupCache, initialized by InitCache
	register vec_uint4 g_SPUCacheLineOffValues __asm__ ("$74");	//special line offset constant (384, 256, 128, 0)

	//pointer to cache LRU control
	register vec_uint4* __restrict g_pSPUCacheLRUCtrl __asm__ ("$73");

	//only declared outside of SPU jobs (_SPU_JOB not defined)
#if !defined(__CRYCG__) && !defined(_SPU_JOB)
	//cache constants used by asm version of DoLookupCache, initialized by InitCache
	register vec_uint4 g_LRUCounterIncr __asm__ ("$72");	//incremented LRU to set
#endif
	//the 16 bytes of the upcoming cross bubble call
	//do not change the register, it is looked up in BubbleGen
	//reused for resolving static/global vars
	register vec_ushort8 g_CrossBubbleData __asm__ ("$71");

	//only declared when profiling is compiled in (DO_SPU_PROFILING)
#if defined(DO_SPU_PROFILING)
	//id passed to cache lookup, reused for passing current atomic EA, partial lifetime is always just within the lookup/miss
	register vec_uint4 g_ProfID __asm__ ("$70");
#endif

#endif

//max num of supported cache lookup id's
//referenced in cacheanalyser
//the topmost PROF_ID_RESERVED id's (MAX_PROF_ID - PROF_ID_RESERVED .. MAX_PROF_ID - 1) are set aside for internal use
#define PROF_ID_RESERVED 8
#define MAX_PROF_ID (1024)

//internally used prof id's
//four of the PROF_ID_RESERVED slots are assigned here
#define PROF_ID_MEMSET_VM (MAX_PROF_ID - PROF_ID_RESERVED)
#define PROF_ID_MEMCPY_VMM (MAX_PROF_ID - PROF_ID_RESERVED + 1)
#define PROF_ID_MEMCPY_VML (MAX_PROF_ID - PROF_ID_RESERVED + 2)
#define PROF_ID_PPU_ALLOC (MAX_PROF_ID - PROF_ID_RESERVED + 3)

//NOTE(review): presumably the size in bytes of the buffer used to transfer printf output - confirm against the printf handler
#define SPU_PRINTF_BUF_SIZE 512

#if defined(__SPU__)

	//defined in SPUJob.h otherwise
	//spu_CheckCacheHazard(cond): active when ENABLE_HAZARD_MODE is defined or _NO_SPU_CACHE_HAZ_CHECK is not defined
	//if cond is false, a "cache hazard" message with condition text, file and line is emitted via spu_printf
	//and SPU_DEBUG_HALT is executed; in the inactive case the macro is an empty statement and cond is not evaluated
	#if !defined(_SPU_JOB)
		#if defined(ENABLE_HAZARD_MODE) || !defined(_NO_SPU_CACHE_HAZ_CHECK)
			#include <spu_printf.h>
			#define spu_CheckCacheHazard(cond) \
				do \
				{ \
					if (__builtin_expect(!(cond), 0)) \
					{ \
						spu_printf("%s cache hazard: (%s) in %s line %d\n",SPU_ASSERT_STRING, #cond, __FILE__, __LINE__); \
						SPU_DEBUG_HALT; \
					} \
				}\
				while (false)
		#else
			#define spu_CheckCacheHazard(cond) do{}while (false)
		#endif //ENABLE_HAZARD_MODE
	#endif //_SPU_JOB

	//macro used in cache lookup macros
	//masks ea with element 0 of g_SetMaskSL4
	//ea is parenthesized so that compound arguments (e.g. a | b) are masked as a whole
	#define MASK_SET(ea) ((ea) & spu_extract(g_SetMaskSL4, 0))
	
	//without any printf support every printf(...) call is compiled out
	#if !defined(SUPP_PRINTF) && !defined(SUPP_OLD_PRINTF)
		#define printf(...) 
	#endif

	//enable DMA debugging info
	//NOTE(review): both branches currently define DEBUG_PRINTF as empty, the printf mapping is commented out
	#if defined(SUPP_PRINTF) || defined(SUPP_OLD_PRINTF)
//		#define DEBUG_PRINTF printf
		#define DEBUG_PRINTF
	#else
		#define DEBUG_PRINTF
	#endif//SUPP_PRINTF

	//if enabled, polling is used to sync a transfer
	//debug builds (_DEBUG) switch polling on, polling in turn switches MEASURE_TIMEOUT on
	#if defined(_DEBUG)
		#define MFC_SYNC_BY_POLLING
	#endif
	#if defined(MFC_SYNC_BY_POLLING)
		#define MEASURE_TIMEOUT
	#endif

#endif //__SPU__

//shift for spu interrupts
//may be predefined by the including file, 24 is the default
#ifndef EVENT_PORT_SHIFT
	#define EVENT_PORT_SHIFT 24
#endif
//interrupt port definitions, specifies event to handle
//EVENT_PRINTF_PORT may be predefined by the including file, the remaining ports are defined unconditionally
#ifndef EVENT_PRINTF_PORT
	#define EVENT_PRINTF_PORT 1U
#endif
#define EVENT_CALLBACK_PORT 2U
#define EVENT_MEMCHANGE_PORT 3U
#define EVENT_PRINTF_PORT_CUSTOM 4U
#define EVENT_PROF_PORT 5U

//DMA tags
//	tag 0..3 are used for async cache write back transfers, do only possibly reuse outside jobs
//	tag 0..3 also used for:
//		tag 0: job in GetJobFromJobQueue(), 
//		tag 1: info block in GetJobFromJobQueue()
//		tag 2: transfers signal2 register to other SPUs
//		tag 3: info block  for depending jobs GetJobFromJobQueue, async cache write back transfer
#define g_scDMAPPUMemTag				4	//tag for PPU memory request

//tag 5 is shared by the three purposes below
#define g_scDMAListTag					5	//tag for input and output memory
#define g_scDMAPPUPrintfTag			5	//tag for transfer of printf buffers
#define g_scDMAPPUProfTag				5	//tag for transfer of cache profiling buffer, also used for small pre memcpy

//tag 6 is shared by the three purposes below
#define g_scDMAJobTag						6
#define g_scDMAOutputTag				6
#define g_MemCpyTempTag					6	//temp tag for memcpy

#define g_MemCpyTempTag1				7	//some temp tag for memcpy

//keep bubble tags in adjacent order, do not use for anything else
//do not change values, accessed in miss handler asm code too
#define g_scDMABubbleTag0				8		//tag for transfer of bubble 0
#define g_scDMABubbleTag1				9	  //tag for transfer of bubble 1
#define g_scDMABubbleTag2				10	//tag for transfer of bubble 2
#define g_scDMABubbleTag3				11	//tag for transfer of bubble 3
//tags 12..27 are used for memcpy (syncd in place, other uses possible)
//g_MemCpyTag and USER_DMA_TAG_BASE both start at tag 12, i.e. the memcpy tags and the user tags are the same range
#define g_MemCpyTag							12	//tag for memset/memcpy between local/main mem toggled by memset/memcpy
#define USER_DMA_TAG_BASE				12	//tag available to custom usage
#define USER_DMA_TAG_CNT				16	//num DMA tags available to custom usage (USER_DMA_TAG_BASE .. USER_DMA_TAG_BASE+(USER_DMA_TAG_CNT-1))
//	tag 31..28 are used for cache DMA prefetching, do not use for anything else

//dma transfer rate per decrementer tick in bytes
#define BYTES_PER_DECR_TICK 160

//NOTE(review): presumably word indices into the job spawn register/state vector - confirm against the job manager
#define JOB_SPAWN_STATE_WORD 0
#define JOB_SPAWN_PUSH_WORD 3

namespace NSPU
{
	//rounds cSize up to the next multiple of 16, aligned values are returned unchanged
	//the addition wraps around for values within 15 of the unsigned maximum
	__attribute__((always_inline))
	inline const unsigned int AlignSize16(const unsigned int cSize)
	{
		const unsigned int cLowBits = 0xF;
		const unsigned int cBumped = cSize + cLowBits;
		return cBumped & ~cLowBits;
	}

	//rounds cSize up to the next multiple of 128, aligned values are returned unchanged
	__attribute__((always_inline))
	inline const unsigned int AlignSize128(const unsigned int cSize)
	{
		const unsigned int cLowBits = 127;
		const unsigned int cBumped = cSize + cLowBits;
		return cBumped & ~cLowBits;
	}

	//rounds cSize up to the next multiple of 4, aligned values are returned unchanged
	__attribute__((always_inline))
	inline const unsigned int AlignSize4(const unsigned int cSize)
	{
		const unsigned int cLowBits = 0x3;
		const unsigned int cBumped = cSize + cLowBits;
		return cBumped & ~cLowBits;
	}

	//rounds cSize up to the next multiple of 8, aligned values are returned unchanged
	__attribute__((always_inline))
	inline const unsigned int AlignSize8(const unsigned int cSize)
	{
		const unsigned int cLowBits = 0x7;
		const unsigned int cBumped = cSize + cLowBits;
		return cBumped & ~cLowBits;
	}

	typedef void *TAddrLS;		//typedef for an SPU local store address on SPU

	//rounds the address stored in rAddr up to the next 16 byte boundary, the result is written back to rAddr
	//the address is handled as an unsigned int
	__attribute__((always_inline))
	inline void Align16(TAddrLS& rAddr)
	{
		const unsigned int cRawAddr = (unsigned int)rAddr;
		const unsigned int cAlignedAddr = (cRawAddr + 0xF) & ~0xF;
		rAddr = (TAddrLS)cAlignedAddr;
	}

	//volatile overload: rounds the address stored in rAddr up to the next 16 byte boundary
	//rAddr is read once and written once
	__attribute__((always_inline))
	inline void Align16(volatile TAddrLS& rAddr)
	{
		const unsigned int cRawAddr = (unsigned int)rAddr;
		const unsigned int cAlignedAddr = (cRawAddr + 0xF) & ~0xF;
		rAddr = (TAddrLS)cAlignedAddr;
	}

#if defined(__SPU__)
	//advances the address stored in rpPtr by cIncrementInBytes bytes, the result is written back to rpPtr
	__attribute__((always_inline))
	inline void IncrementPointer(TAddrLS& rpPtr, const unsigned int cIncrementInBytes)
	{
		const unsigned int cOldAddr = (unsigned int)rpPtr;
		const unsigned int cNewAddr = cOldAddr + cIncrementInBytes;
		rpPtr = (TAddrLS)cNewAddr;
	}

	//volatile overload: advances the address stored in rpPtr by cIncrementInBytes bytes
	//rpPtr is read once and written once
	__attribute__((always_inline))
	inline void IncrementPointer(volatile TAddrLS& rpPtr, const unsigned int cIncrementInBytes)
	{
		const unsigned int cOldAddr = (unsigned int)rpPtr;
		const unsigned int cNewAddr = cOldAddr + cIncrementInBytes;
		rpPtr = (volatile TAddrLS)cNewAddr;
	}

	//returns the current content of the $sp register as an unsigned int
	__attribute__((always_inline))
	inline const unsigned int GetStackAddress()
	{
		//local register variable bound to $sp, reading it yields the register value
		register unsigned int __sp __asm__("$sp");
		return __sp;
	}

	//reads the SPU_RdSigNotify1 channel and returns the value read
	//NOTE(review): presumably the channel read stalls until a signal has arrived - confirm against the channel documentation
	__attribute__((always_inline))
	inline const unsigned int WaitSignal()
	{
		const unsigned int cSignalValue = spu_readch(SPU_RdSigNotify1);
		return cSignalValue;
	}

	//shifts the whole quadword cVec right by cBitCount bits
	//done in two passes: spu_rlmaskqwbytebc driven by (7 - bit count), then spu_rlmaskqw with the negated bit count
	__attribute__((always_inline))
	inline const vec_uint4 ShiftRight128(const vec_uint4 cVec, const unsigned short cBitCount)
	{
		const vec_uint4 cByteShifted = spu_rlmaskqwbytebc(cVec, 7-cBitCount);
		return spu_rlmaskqw(cByteShifted, -cBitCount);
	}

	//shifts the whole quadword cVec left by cBitCount bits
	//done in two passes: spu_slqwbyte by (bit count / 8) bytes, then spu_slqw with the bit count
	__attribute__((always_inline))
	inline const vec_uint4 ShiftLeft128(const vec_uint4 cVec, const unsigned short cBitCount)
	{
		const vec_uint4 cByteShifted = spu_slqwbyte(cVec, cBitCount >> 3);
		return spu_slqw(cByteShifted, cBitCount);
	}

	//returns the index 0..3 of the lowest value in cVal4
	//pairwise tournament: (0 vs 1) and (2 vs 3), then the two winners against each other
	//the comparisons are strict greater-than, so on equal values the lower index is kept
	//only element 0 of the intermediate vectors is meaningful, it is the one extracted at the end
	__attribute__((always_inline))
	inline const unsigned int GetReplIndex(const vec_uint4 cVal4)
	{
		//third and fourth byte within the cIndexInSet-word is used
		//NOTE(review): no cIndexInSet exists in this function, the line above looks like a leftover - confirm
		//returns the index with the lowest LRU value (least recently used due to lowest incrementer value)
		//quadword rotations by 4, 8 and 12 bytes, used to bring each candidate word into element 0
		const vec_uint4 cVal0							= cVal4;
		const vec_uint4 cVal1							= spu_rlqwbyte(cVal4, 4);
		const vec_uint4 cVal2							= spu_rlqwbyte(cVal4, 8);
		const vec_uint4 cVal3							= spu_rlqwbyte(cVal4, 12);
		//compare masks of the first round
		const vec_uint4 cCmpVec01					= spu_cmpgt(cVal0, cVal1);
		const vec_uint4 cCmpVec23					= spu_cmpgt(cVal2, cVal3);
		//per pair: the second operand (and its index) is selected where the first one is greater
		const vec_uint4 cCmpSelRes01			= spu_sel(cVal0, cVal1, cCmpVec01);
		const vec_uint4 cCmpIndexRes01		= spu_sel(spu_promote((unsigned int)0,0), spu_promote((unsigned int)1,0), cCmpVec01);//spu_and(cCmpVec01, 1)
		const vec_uint4 cCmpSelRes23			= spu_sel(cVal2, cVal3, cCmpVec23);
		const vec_uint4 cCmpIndexRes23		= spu_sel(spu_promote((unsigned int)2,0), spu_promote((unsigned int)3,0), cCmpVec23);
		//final round between the two pair winners, only the index is carried on
		const vec_uint4 cCmpVec0123				= spu_cmpgt(cCmpSelRes01, cCmpSelRes23);
		const vec_uint4 cCmpIndexRes0123	= spu_sel(cCmpIndexRes01, cCmpIndexRes23, cCmpVec0123);
		return spu_extract(cCmpIndexRes0123, 0);
	}

	//class and macro to save/restore cache regs
	//scope guard: the constructor copies six cache register variables into members,
	//the destructor writes the copies back
	//covered: g_SetMaskSL4, g_pSPUCache, g_pSPUCacheDir, g_LRUCounter, g_SPUCacheLineOffValues, g_pSPUCacheLRUCtrl
	//not covered: g_LRUCounterIncr, g_CrossBubbleData, g_ProfID
	//NOTE(review): copy construction is not disabled, a copy would write the same values back a second time
	class __cache_save_regs_helper
	{
	public:
		//captures the current register values
		__cache_save_regs_helper() : m_SetMaskSL4(g_SetMaskSL4), m_pSPUCache(g_pSPUCache), 
			m_pSPUCacheDir(g_pSPUCacheDir), m_LRUCounter(g_LRUCounter), m_SPUCacheLineOffValues(g_SPUCacheLineOffValues),
			m_pSPUCacheLRUCtrl(g_pSPUCacheLRUCtrl){}
		//writes the captured values back, the casts strip the const/volatile qualifiers of the stored pointers
		~__cache_save_regs_helper()
		{
			g_SetMaskSL4						= m_SetMaskSL4;
			g_pSPUCache							= (vec_uint4* __restrict)m_pSPUCache;
			g_pSPUCacheDir					= (vec_uint4* __restrict)m_pSPUCacheDir;
			g_LRUCounter						= m_LRUCounter;
			g_SPUCacheLineOffValues = m_SPUCacheLineOffValues;
			g_pSPUCacheLRUCtrl			= (vec_uint4* __restrict)m_pSPUCacheLRUCtrl;
		}
	private:
		//saved copies, declared volatile const
		//NOTE(review): volatile presumably keeps the copies from being optimized away or merged with the register variables - confirm
		volatile const vec_uint4 m_SetMaskSL4;
		volatile const vec_uint4* const  __restrict m_pSPUCache;
		volatile const vec_uint4* const __restrict m_pSPUCacheDir;
		volatile const vec_uint4 m_LRUCounter;
		volatile const vec_uint4 m_SPUCacheLineOffValues;
		volatile const vec_uint4* const __restrict m_pSPUCacheLRUCtrl;
	};

	//wraps the given statement(s)/expression into a GNU statement expression that first creates a
	//__cache_save_regs_helper, so the cache registers captured on entry are written back when the helper goes out of scope
	//usage: __cache_save_regs(SomeCall(args));
	#define __cache_save_regs(...) ({ \
		NSPU::__cache_save_regs_helper __saved_regs; \
		__VA_ARGS__; })
#endif//__SPU__
}//NSPU


#endif //PS3 && !__CRYCG__

#if defined(__SPU__)
//time stamp replacement on SPU: returns the value read from the SPU_RdDec channel
__attribute__((always_inline))
inline const unsigned int rdtsc()
{
	const unsigned int cDecrValue = spu_readch(SPU_RdDec);
	return cDecrValue;
}

extern "C" 
{
	namespace std
	{
		//single precision power function, only declared here
		float powf(float, float);

		//double precision pow routed through powf:
		//both arguments are narrowed to float, the float result is widened back to double
		__attribute__((always_inline))
		inline double pow(double x, double y)
		{
			const float cBase = (float)x;
			const float cExponent = (float)y;
			const float cResult = powf(cBase, cExponent);
			return (double)cResult;
		}
	}
}
#endif

#endif //__SPU_H
