/*
	Pulls in all headers required by an SPU job.
	Must never be included from a header file or from non-SPU-job code (_SPU_JOB is defined by the makefile).
*/
#ifndef __SPU_JOB_H
#define __SPU_JOB_H
#pragma once

#if defined(PS3)

#if defined(__SPU__) 

#include <SPU/SPU.h>
#include <IJobManSPU.h>
#include <stdarg.h>

//------------------------------------software cache---------------------------------------------------

#if defined(_SPU_JOB)
//Function-pointer types for routines that are reached through fixed addresses (see the address tables further down).
//Every pointer type is const-qualified, so a variable of such a type cannot be re-pointed after its initialisation.

//takes a pointer reference and an unsigned value; presumably allocates PPU memory into the reference - TODO confirm
typedef void (*const TPPUAllocFunc)(void*&, const unsigned int);
//takes a pointer; presumably releases memory obtained through TPPUAllocFunc - TODO confirm
typedef void (*const TPPUFreeFunc)(void*);
//maps one int to another; NOTE(review): exact meaning of argument and result is defined on the driver side
typedef const int (*const TSPUResolveGlobalVarFunc)(const int);
//printf-style variadic handler
typedef void (*const TSPUPrintfHandler)(const char*, ...);
//cache lookup entry point; the argument list depends on ENABLE_HAZARD_MODE (3 arguments with it, 6 without)
#if defined(ENABLE_HAZARD_MODE)
	typedef void* (*const TSPUDoLookupCacheFunc)(const unsigned int, const unsigned int, const int);
#else
	typedef void* (*const TSPUDoLookupCacheFunc)(const unsigned int, const unsigned int, const unsigned int, const unsigned int, const unsigned int, const unsigned int);
#endif//ENABLE_HAZARD_MODE
//cache assert callback; __cache_assert passes an SPU address, a PPU address, __FILE__ and __LINE__ and halts on a 0 result
typedef int (*const TSPUCacheAssertFunc)(const unsigned int, const unsigned int, const char*, const unsigned int);
//four int arguments; NOTE(review): semantics not visible in this header
typedef void (*const TSPUSetActiveBubblesFunc)(const int, const int, const int, const int);
//parameterless entry points
typedef void (*const TSPUSyncAtomicDCache)();
typedef void (*const TSPUStartAtomicWrite)();
//vprintf-style logging entry point
typedef void (*const TLogMessageV)(const char* szFormat, va_list args);
//cache flush entry point taking a single flag (__flush_cache passes true)
typedef void (*const TSPUFlushCacheFunc)(const bool);

//external functions coming from libDriverDMA.a
//they are wrapped further down by the std::memset__VM / std::memcpy__V?? inline functions
//NOTE(review): the suffix letters presumably encode the memory kind of destination/source
//	(L = local, M = main memory) - confirm against libDriverDMA
extern void* Memset__VM(void* pDest, int cVal, unsigned int size);
extern void* Memcpy__VLM(void* const pDest, const void* const cpSrc, const unsigned int cSize);
extern void* Memcpy__VML(void* const pDest, const void* const cpSrc, const unsigned int cSize);
extern void* Memcpy__VMM(void* const pDest, const void* const cpSrc, const unsigned int cSize);
//used by __flush_cache_range, which always passes cDoWriteBack = true
extern void FlushCacheRange(const unsigned int cEAFrom, const unsigned int cSize, const bool cDoWriteBack);
//note: this check is redundant, the enclosing region is already conditional on _SPU_JOB
#if defined(_SPU_JOB)
	//builds the include operand < name_ppu_include.h> for use as: #include PPU_JOB_DEF(name)
	#define PPU_JOB_DEF(name) < name ## _ppu_include.h>
	namespace NPPU{class CSPUJobDel;}
	//job spawn entry point; NOTE(review): the meaning of the unnamed parameters is not visible in this header
	extern const NPPU::EAddJobRes RunSPUJob
	(
		const NPPU::CSPUJobDel&,
		const unsigned int, 
		const unsigned char, 
		const unsigned int,
		const bool,
		const unsigned short
	);
#endif

//address of memory manager, will be updated by makefile
//do not edit these lines by hand, accessed in SPUDriverMemManagerParser and BubbleGen
//-------------------------------------------------------------------------------------------------

#if defined(_DEBUG)
//_DEBUG build: fixed addresses of driver functions and globals (tool generated, do not edit by hand)
static TPPUAllocFunc PPUAlloc_func = (const TPPUAllocFunc)26768;
static TPPUFreeFunc PPUFree_func = (const TPPUFreeFunc)29024;
static TSPUDoLookupCacheFunc SPUDoLookupCache_func = (const TSPUDoLookupCacheFunc)37248;
static TSPUDoLookupCacheFunc SPUDoLookupCacheMiss_func = (const TSPUDoLookupCacheFunc)37120;
static TSPUDoLookupCacheFunc SPUDoLookupCacheChecked_func = (const TSPUDoLookupCacheFunc)0;
static TSPUPrintfHandler SPUPrintfHandler_func = (const TSPUPrintfHandler)0;
static TSPUCacheAssertFunc SPUDoLookupCacheAssertFunc = (const TSPUCacheAssertFunc)26472;
static TSPUSetActiveBubblesFunc SPUSetActiveBubblesFunc = (const TSPUSetActiveBubblesFunc)27968;
static TSPUResolveGlobalVarFunc SPUResolveGlobalVarFunc = (TSPUResolveGlobalVarFunc)26208;
TSPUSyncAtomicDCache SPUSyncAtomicDCache = (TSPUSyncAtomicDCache)40784;
TSPUStartAtomicWrite SPUStartAtomicWrite = (TSPUStartAtomicWrite)26248;
TLogMessageV SPULogMessageV = (TLogMessageV)26128;
TSPUFlushCacheFunc SPUFlushCache = (TSPUFlushCacheFunc)33832;
#define G_SPU_CACHE_PROF_PERF_STAT_ADDR 0
#define G_SPU_CACHE_PREF_DIR_ADDR 43472
#define G_SPU_CACHE_PREF_LRU_DIR_ADDR 43488
#define G_SPU_CACHE_PREF_BUF_ADDR 42112
#define G_SPU_CACHE_ASYNC_RANGES_DIR_FROM 43456
#define G_SPU_CACHE_ASYNC_RANGES_DIR_TO 43440
#define G_SPU_CACHE_CUR_ATOMIC_EA 43504
#define G_SPU_CACHE_SHADOW_CACHE 43552
#define G_SPU_INFO_BLOCK 41856
#define G_SPU_BUB_DIR_INFO 43584
#define G_SPU_JOB_SPAWN_REG 43184
#define G_SPU_DEBUG_STATE 43216
#define G_SPU_NUM_SETS 43568
#define G_SPU_LOG 43424

#elif defined(DO_SPU_PROFILING)
//profiling build: same symbols as the other branches, the only one with a non-zero G_SPU_CACHE_PROF_PERF_STAT_ADDR
static TPPUAllocFunc PPUAlloc_func = (const TPPUAllocFunc)9664;
static TPPUFreeFunc PPUFree_func = (const TPPUFreeFunc)8656;
static TSPUDoLookupCacheFunc SPUDoLookupCache_func = (const TSPUDoLookupCacheFunc)16512;
static TSPUDoLookupCacheFunc SPUDoLookupCacheMiss_func = (const TSPUDoLookupCacheFunc)16256;
static TSPUDoLookupCacheFunc SPUDoLookupCacheChecked_func = (const TSPUDoLookupCacheFunc)0;
static TSPUPrintfHandler SPUPrintfHandler_func = (const TSPUPrintfHandler)0;
static TSPUCacheAssertFunc SPUDoLookupCacheAssertFunc = (const TSPUCacheAssertFunc)0;
static TSPUSetActiveBubblesFunc SPUSetActiveBubblesFunc = (const TSPUSetActiveBubblesFunc)7952;
static TSPUResolveGlobalVarFunc SPUResolveGlobalVarFunc = (TSPUResolveGlobalVarFunc)7536;
TSPUSyncAtomicDCache SPUSyncAtomicDCache = (TSPUSyncAtomicDCache)10808;
TSPUStartAtomicWrite SPUStartAtomicWrite = (TSPUStartAtomicWrite)7680;
TLogMessageV SPULogMessageV = (TLogMessageV)7664;
TSPUFlushCacheFunc SPUFlushCache = (TSPUFlushCacheFunc)14136;
#define G_SPU_CACHE_PROF_PERF_STAT_ADDR 20176
#define G_SPU_CACHE_PREF_DIR_ADDR 20560
#define G_SPU_CACHE_PREF_LRU_DIR_ADDR 20576
#define G_SPU_CACHE_PREF_BUF_ADDR 19072
#define G_SPU_CACHE_ASYNC_RANGES_DIR_FROM 20544
#define G_SPU_CACHE_ASYNC_RANGES_DIR_TO 20528
#define G_SPU_CACHE_CUR_ATOMIC_EA 32912
#define G_SPU_CACHE_SHADOW_CACHE 32960
#define G_SPU_INFO_BLOCK 18816
#define G_SPU_BUB_DIR_INFO 32992
#define G_SPU_JOB_SPAWN_REG 20272
#define G_SPU_DEBUG_STATE 20304
#define G_SPU_NUM_SETS 32976
#define G_SPU_LOG 20512  

#else
//neither _DEBUG nor DO_SPU_PROFILING: assert function pointer is 0, so __cache_assert does nothing at runtime
static TPPUAllocFunc PPUAlloc_func = (const TPPUAllocFunc)8776;
static TPPUFreeFunc PPUFree_func = (const TPPUFreeFunc)7840;
static TSPUDoLookupCacheFunc SPUDoLookupCache_func = (const TSPUDoLookupCacheFunc)10240;
static TSPUDoLookupCacheFunc SPUDoLookupCacheMiss_func = (const TSPUDoLookupCacheFunc)10112;
static TSPUDoLookupCacheFunc SPUDoLookupCacheChecked_func = (const TSPUDoLookupCacheFunc)0;
static TSPUPrintfHandler SPUPrintfHandler_func = (const TSPUPrintfHandler)0;
static TSPUCacheAssertFunc SPUDoLookupCacheAssertFunc = (const TSPUCacheAssertFunc)0;
static TSPUSetActiveBubblesFunc SPUSetActiveBubblesFunc = (const TSPUSetActiveBubblesFunc)7176;
static TSPUResolveGlobalVarFunc SPUResolveGlobalVarFunc = (TSPUResolveGlobalVarFunc)6912;
TSPUSyncAtomicDCache SPUSyncAtomicDCache = (TSPUSyncAtomicDCache)9800;
TSPUStartAtomicWrite SPUStartAtomicWrite = (TSPUStartAtomicWrite)13248;
TLogMessageV SPULogMessageV = (TLogMessageV)6936;
TSPUFlushCacheFunc SPUFlushCache = (TSPUFlushCacheFunc)12672;
#define G_SPU_CACHE_PROF_PERF_STAT_ADDR 0
#define G_SPU_CACHE_PREF_DIR_ADDR 16576
#define G_SPU_CACHE_PREF_LRU_DIR_ADDR 16592
#define G_SPU_CACHE_PREF_BUF_ADDR 15232
#define G_SPU_CACHE_ASYNC_RANGES_DIR_FROM 16560
#define G_SPU_CACHE_ASYNC_RANGES_DIR_TO 16544
#define G_SPU_CACHE_CUR_ATOMIC_EA 16608
#define G_SPU_CACHE_SHADOW_CACHE 16656
#define G_SPU_INFO_BLOCK 14976
#define G_SPU_BUB_DIR_INFO 16688
#define G_SPU_JOB_SPAWN_REG 16304
#define G_SPU_DEBUG_STATE 16336
#define G_SPU_NUM_SETS 16672
#define G_SPU_LOG 16528          

#endif//_DEBUG

//------------------------------------__spu_cache_lookup/__cache_assert-------------------------------
	
//cache lookup function, ea is PPU address, keep in sync with declaration in SPUMemManager.cpp
//CheckIsPresent(ea):          hazard mode only, reports a cache hazard if the 4 way lookup for ea yields a negative index
//__spu_cache_lookup(ea,..):   calls the cache lookup entry point for PPU address ea, result is a void*
//__spu_cache_lookup_miss(..): same argument list, routed to the miss entry point (outside hazard mode)
//all macro arguments are parenthesised before being cast, so compound expressions are converted as a whole;
//	ea is cast to uint32 in both modes, hence pointers as well as integers may be passed
//66051 == 0x00010203; NOTE(review): the meaning of this constant is defined by the driver side lookup - confirm there
#if defined(ENABLE_HAZARD_MODE)
	#define CheckIsPresent(ea) \
		spu_CheckCacheHazard(0 <= \
		GetCacheIndexNum(SetCache4WayLookup(GetCacheSetIndex((uint32)(ea)), \
		spu_splats((uint32)(ea) & ~scSPUCacheLineSizeMask))))
	#define __spu_cache_lookup(ea, LRUIncr, PrefDiff) SPUDoLookupCacheChecked_func((uint32)(ea), (LRUIncr), (PrefDiff))
	#define __spu_cache_lookup_miss(ea, LRUIncr, PrefDiff) SPUDoLookupCacheChecked_func((uint32)(ea), (LRUIncr), (PrefDiff))
#else
	#define CheckIsPresent(ea)
	#define __spu_cache_lookup(ea, LRUIncr, PrefDiff) SPUDoLookupCache_func((uint32)(ea), (uint32)(ea) & ~scSPUCacheLineSizeMask, MASK_SET(((uint32)(ea)>>3)), (uint32)(PrefDiff), (uint32)(LRUIncr), 66051)
	#define __spu_cache_lookup_miss(ea, LRUIncr, PrefDiff) SPUDoLookupCacheMiss_func((uint32)(ea), (uint32)(ea) & ~scSPUCacheLineSizeMask, MASK_SET(((uint32)(ea)>>3)), (uint32)(PrefDiff), (uint32)(LRUIncr), 66051)
#endif

//__cache_assert(SPU_PTR, PTR, OFFSET):
//	if an assert function is installed (pointer non zero), calls it with the SPU address, the PPU address
//	PTR + OFFSET/8 (OFFSET is divided by 8 before being added), __FILE__ and __LINE__; halts on a 0 result
//	compiles to nothing if _NO_SPU_CACHE_ASSERT is defined
//SPU_PTR and PTR are parenthesised before the cast so that compound expressions are converted as a whole
//note: expands to a bare if statement, do not use it as the unbraced body of an if that has an else
#if !defined(_NO_SPU_CACHE_ASSERT)
	#define __cache_assert(SPU_PTR, PTR, OFFSET)\
		if(SPUDoLookupCacheAssertFunc) \
		{\
			if(0 == SPUDoLookupCacheAssertFunc((uint32)(SPU_PTR), (uint32)(PTR) + (OFFSET) / 8, __FILE__, __LINE__))\
				SPU_DEBUG_HALT;\
		}
#else
	#define __cache_assert(SPU_PTR, PTR, OFFSET)
#endif

#endif //_SPU_JOB

//------------------------------------printf----------------------------------------------------------

//printf redirection:
//	SUPP_PRINTF:      printf(...) is forwarded to SPUPrintfHandler_func inside jobs (_SPU_JOB)
//	                  and to the external SPUPrintfHandler everywhere else
//	SUPP_OLD_PRINTF:  printf is mapped to spu_printf unless printf is already a macro
#if defined(SUPP_PRINTF)
	#if defined(_SPU_JOB)
		#define SPU_PRINTF_MISS_HANDLER_FUNC SPUPrintfHandler_func
	#else
		extern void SPUPrintfHandler(const char*, ...);
		#define SPU_PRINTF_MISS_HANDLER_FUNC SPUPrintfHandler
	#endif

	#define printf(format, args...) SPU_PRINTF_MISS_HANDLER_FUNC(format, ## args)
#elif defined(SUPP_OLD_PRINTF) && !defined(printf)
	#include <spu_printf.h>
	#define printf spu_printf
#endif //SUPP_PRINTF

#include <SPU/SPUMultiThread.h>

#if defined(_SPU_JOB)

#include <CryModuleDefs.h>
#ifndef eCryModule
#define eCryModule eCryM_System
#endif
#include <platform.h>

#include "SPUUtilities.h"
#include "Cache/CacheDefs_spu.h"
#if defined(ENABLE_HAZARD_MODE)
	#include "Cache/Cache_spu.h"
#endif

#include "CodePage/SPUBubbles.h"
#include "CodePage/BubbleLayout.h"


//------------------------------------memcpy/memset-----------------------------------------------------

//thin always-inlined forwarders to the libDriverDMA routines, each returns whatever the wrapped routine returns
//the size_t size is narrowed to the unsigned int the wrapped routine takes
namespace std
{
	//forwards to Memset__VM
	__attribute__((always_inline)) inline void* memset__VM(void* const __restrict pDest, int val, size_t size)
	{
		const unsigned int cBytes = (unsigned int)size;
		return Memset__VM(pDest, val, cBytes);
	}

	//forwards to Memcpy__VMM
	__attribute__((always_inline)) inline void* memcpy__VMM(void* const __restrict pDest, const void* const __restrict cpSrc, size_t size)
	{
		const unsigned int cBytes = (unsigned int)size;
		return Memcpy__VMM(pDest, cpSrc, cBytes);
	}

	//forwards to Memcpy__VML
	__attribute__((always_inline)) inline void* memcpy__VML(void* const __restrict pDest, const void* const __restrict cpSrc, size_t size)
	{
		const unsigned int cBytes = (unsigned int)size;
		return Memcpy__VML(pDest, cpSrc, cBytes);
	}

	//forwards to Memcpy__VLM
	__attribute__((always_inline)) inline void* memcpy__VLM(void* const __restrict pDest, const void* const __restrict cpSrc, size_t size)
	{
		const unsigned int cBytes = (unsigned int)size;
		return Memcpy__VLM(pDest, cpSrc, cBytes);
	}
}

//the purely local variants need no own implementation, they are the plain library routines
#define memset__VL memset
#define memcpy__VLL memcpy

//------------------------------------cache flushing-----------------------------------------------------

//flushes cSize bytes starting at cEAFrom through FlushCacheRange, always requesting write back
__attribute__((always_inline)) inline void __flush_cache_range(const uint32 cEAFrom, const uint32 cSize)
{
	const bool cWriteBack = true;
	FlushCacheRange(cEAFrom, cSize, cWriteBack);
}

__attribute__((always_inline))
inline void __flush_cache()
{
	SPUFlushCache(true);
	//clear LRU and cache dir
	const vec_uint4 cZero = spu_splats((uint32)0);
	const int cNumSets = *(int*)G_SPU_NUM_SETS;
	//reset cache dir entries, 4 at once to give the branch hint a chance to be set
	for(int s=0; s<cNumSets; s += 4)
	{
		g_pSPUCacheLRUCtrl[s]		= cZero;
		g_pSPUCacheDir[s]				= cZero;
		g_pSPUCacheLRUCtrl[s+1] = cZero;
		g_pSPUCacheDir[s+1]			= cZero;
		g_pSPUCacheLRUCtrl[s+2] = cZero;
		g_pSPUCacheDir[s+2]			= cZero;
		g_pSPUCacheLRUCtrl[s+3] = cZero;
		g_pSPUCacheDir[s+3]			= cZero;
	}
}

//spu_CheckCacheHazard(cond): prints a diagnostic and halts if cond is false
//the check is compiled out only if hazard mode is off and _NO_SPU_CACHE_HAZ_CHECK is defined
#if !defined(ENABLE_HAZARD_MODE) && defined(_NO_SPU_CACHE_HAZ_CHECK)
	#define spu_CheckCacheHazard(cond) do{}while (false)
#else
	//the failing case is marked as unlikely for the compiler
	#define spu_CheckCacheHazard(cond) \
	do \
	{ \
		if (__builtin_expect(!(cond), 0)) \
		{ \
			printf("%s cache hazard: (%s) in %s line %d\n",SPU_ASSERT_STRING, #cond, __FILE__, __LINE__); \
			SPU_DEBUG_HALT; \
		} \
	}\
	while (false)
#endif //hazard check selection

//__spu_dma_pref(cEA): with PREFETCH defined, starts a DMA get of the cache line containing cEA into the prefetch buffer;
//	without PREFETCH it is an empty function
#if defined(PREFETCH)
	#include "Cache/Cache_spu.h"
	#include <spu_mfcio.h>
	__attribute__((always_inline))
	inline void __spu_dma_pref(const uint32 cEA)
	{
		//the MFC channel writes are interleaved with the directory updates, keep the statement order
		vec_uint4 *const __restrict pPrefetchLRUDir = (vec_uint4*)G_SPU_CACHE_PREF_LRU_DIR_ADDR;
		const vec_uint4 cCurPrefLRUDirCont					= *pPrefetchLRUDir;
		//line aligned PPU address which gets fetched
		const uint32 cAllocatedEA										= cEA & ~scSPUCacheLineSizeMask;
		//slot to replace, chosen by GetReplIndex from the prefetch LRU directory
		const uint32 cLRUReplIndex									= NSPU::GetReplIndex(cCurPrefLRUDirCont);
		//select mask with all bits set in the element of the chosen slot only
		const vec_uint4 cLRUReplMask								= spu_insert(0xFFFFFFFF, (const vec_uint4){0}, cLRUReplIndex);
		const vec_uint4 cAllocatedEA4								= spu_splats(cAllocatedEA);
		vec_uint4 *const __restrict pPrefetchDir		= (vec_uint4*)G_SPU_CACHE_PREF_DIR_ADDR;
		const vec_uint4 cCurPrefDirCont							= *pPrefetchDir;
		//local store target: chosen slot inside the prefetch buffer
		si_wrch(MFC_LSA,si_from_uint(G_SPU_CACHE_PREF_BUF_ADDR + (cLRUReplIndex << scSPUCacheLineSizeShift)));
//		si_wrch(MFC_EAH,si_from_uint((uint64)(cEA)>>32));//this should not be necessary at all, but it becomes very slow otherwise
		//stamp the chosen slot with the current LRU counter
		*pPrefetchLRUDir														= spu_sel(cCurPrefLRUDirCont, g_LRUCounter, cLRUReplMask);
		si_wrch(MFC_EAL,si_from_uint(cAllocatedEA));
		//record the fetched address in the prefetch directory slot
		*pPrefetchDir																= spu_sel(cCurPrefDirCont, cAllocatedEA4, cLRUReplMask);
		si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));
		si_wrch(MFC_TagID,si_from_uint(GetPrefetchTagID(cLRUReplIndex)));
		si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD));//toggle prefetch
	}
#else
	//prefetching disabled: nothing to do
	__attribute__((always_inline))
	inline void __spu_dma_pref(const uint32)
	{}
#endif//PREFETCH

//------------------------------------cache touch-----------------------------------------------------

//touch function to update the LRU counter for an existing cache line (cpSPUPtr = cached SPU ptr)
//determine the set and index within by applying some arithmetic on the SPU address
__attribute__((always_inline))
inline void __spu_cache_touch(const uint8* const __restrict cpSPUPtr, const uint32 cLRUIncr)
{
	uint32 *const __restrict pLRUSlot = 
		(uint32*)((((uint32)cpSPUPtr & ~scSPUCacheLineSizeMask) >> (scSPUCacheLineSizeShift-2)) + 
		((uint32)g_pSPUCacheLRUCtrl - ((uint32)g_pSPUCache >> (scSPUCacheLineSizeShift-2))));
	const vec_uint4 cNewLRUCntr			  = spu_add(g_LRUCounter, cLRUIncr);
	g_LRUCounter = spu_add(g_LRUCounter, 1);
	*pLRUSlot		 = spu_extract(cNewLRUCntr, 0);
};

__attribute__((always_inline))
inline void __spu_cache_touch_ex(const uint32 cPrepConst, const uint32 cLRUIncr)
{
	uint32 *const __restrict pLRUSlot = (uint32*)cPrepConst;
	const vec_uint4 cNewLRUCntr			  = spu_add(g_LRUCounter, cLRUIncr);
	g_LRUCounter = spu_add(g_LRUCounter, 1);
	*pLRUSlot		 = spu_extract(cNewLRUCntr, 0);
};

//computes the LRU slot address for cpSPUPtr: scaled line address plus cTouchConst
//	(cTouchConst is the value __spu_cache_init_touch returns), the result feeds __spu_cache_touch_ex
__attribute__((always_inline))
inline const uint32 __spu_cache_prep_touch(const uint8* const __restrict cpSPUPtr, const uint32 cTouchConst)
{
	const uint32 cLineStart = (uint32)cpSPUPtr & ~scSPUCacheLineSizeMask;
	return (cLineStart >> (scSPUCacheLineSizeShift-2)) + cTouchConst;
}

//returns the constant part of the LRU slot address computation:
//	address of g_pSPUCacheLRUCtrl minus the scaled address of g_pSPUCache
__attribute__((always_inline))
inline const uint32 __spu_cache_init_touch()
{
	const uint32 cScaledCacheBase = (uint32)g_pSPUCache >> (scSPUCacheLineSizeShift-2);
	return (uint32)g_pSPUCacheLRUCtrl - cScaledCacheBase;
}

//------------------------------------cache lookup funcs-----------------------------------------------------

__attribute__((always_inline))
inline void __spu_cache_incr_lru_cntrl(const uint32 cIncr)
{
	g_LRUCounter = spu_add(g_LRUCounter, cIncr);
}

//cache line selection function which selects a cache line from 2 already cached, 
//	in PPU memory consecutive cache lines
__attribute__((always_inline))
inline void* __spu_cache_select
(
	const uint8* const __restrict cpSPUPtrExist0,	//SPU address of first cache line
	const uint8* const __restrict cpSPUPtrExist1,	//SPU address of second cache line
	const uint32 cpPPUPtrOff											//PPU offset relative to the corresponding PPU address of the first cache line
)
{
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask		= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr, 0), spu_promote((uint32)127, 0));
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0, 0), spu_promote(cAlignedSPUExist1, 0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//lookup macros, presumably used by generated job code - TODO confirm
//	OFFSET values are divided by 8 before being added to a byte pointer
//	RESULT receives the SPU pointer, PROFID is only evaluated by the profiling variants
//	pointers below 256 KB are treated as SPU addresses, PPU pointers are required to lie above 256 KB
//all macro arguments are parenthesised where they meet a cast or an operator, and the looked up pointer
//	is handed on as uint32 in both branches
//NOTE(review): __cache_lookup_miss is only provided by the non profiling branch
#if defined(DO_SPU_PROFILING)
	//records cProfID in g_ProfID, then performs the regular cache lookup
	__attribute__((always_inline))
	inline void* __spu_cache_lookup_prof(const uint32 cEA, const uint32 cLRUIncr, const uint32 cProfID, const int cPrefDiff)
	{
		assert(cProfID < MAX_PROF_ID);
		g_ProfID = spu_promote(cProfID,0);
		return __spu_cache_lookup(cEA, cLRUIncr, cPrefDiff);
	}

	#define __cache_lookup(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((ptrdiff_t)_PTR > 256 * 1024);\
		spu_CheckCacheHazard((PROFID) < MAX_PROF_ID);\
		RESULT = __spu_cache_lookup_prof((uint32)_PTR, LRU_INCR, PROFID, PREF_DIFF); })

	#define __cache_lookup_ex(RESULT, SPU_PTR_EXIST, PPU_PTR_EXIST, OFFSET_EXIST, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)(SPU_PTR_EXIST) < 256 * 1024);\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST));\
		spu_CheckCacheHazard((ptrdiff_t)_PTR > 256 * 1024);\
		spu_CheckCacheHazard((PROFID) < MAX_PROF_ID);\
		const uint8_t *const _PPU_PTR_EXIST = \
		(const uint8_t *)(PPU_PTR_EXIST) + (OFFSET_EXIST) / 8; \
		RESULT = __spu_cache_lookup_ex((const uint8_t *const)(SPU_PTR_EXIST), _PPU_PTR_EXIST, (const uint8* const __restrict)_PTR, LRU_INCR, PROFID, PREF_DIFF); })

	#define __cache_select(RESULT, SPU_PTR_EXIST0, SPU_PTR_EXIST1, OFFSET, PROFID) ({ \
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST0));\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST1));\
		RESULT = __spu_cache_select((const uint8_t *const)(SPU_PTR_EXIST0), (const uint8_t *const)(SPU_PTR_EXIST1), (OFFSET) / 8); })

#else

	#define __cache_lookup(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((ptrdiff_t)_PTR > 256 * 1024);\
		RESULT = __spu_cache_lookup((uint32)_PTR, LRU_INCR, PREF_DIFF); })

	#define __cache_lookup_miss(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((ptrdiff_t)_PTR > 256 * 1024);\
		RESULT = __spu_cache_lookup_miss((uint32)_PTR, LRU_INCR, PREF_DIFF); })

	#define __cache_lookup_ex(RESULT, SPU_PTR_EXIST, PPU_PTR_EXIST, OFFSET_EXIST, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)(SPU_PTR_EXIST) < 256 * 1024);\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST));\
		spu_CheckCacheHazard((ptrdiff_t)_PTR > 256 * 1024);\
		const uint8_t *const _PPU_PTR_EXIST = \
		(const uint8_t *)(PPU_PTR_EXIST) + (OFFSET_EXIST) / 8; \
		RESULT = __spu_cache_lookup_ex((const uint8_t *const)(SPU_PTR_EXIST), _PPU_PTR_EXIST, (const uint8* const __restrict)_PTR, LRU_INCR, PROFID, PREF_DIFF); })

	#define __cache_select(RESULT, SPU_PTR_EXIST0, SPU_PTR_EXIST1, OFFSET, PROFID) ({ \
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST0));\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST1));\
		RESULT = __spu_cache_select((const uint8_t *const)(SPU_PTR_EXIST0), (const uint8_t *const)(SPU_PTR_EXIST1), (OFFSET) / 8); })
#endif //DO_SPU_PROFILING

//extended cache lookup function which checks first if cpPtr (PPU pointer) lies in the same cache line specified by
//  cpSPUPtrExist (cpPPUPtrExist is corresponding PPU pointer)
//keep in sync with definition in SPUMemManager_spu.h
__attribute__((always_inline))
inline void* __spu_cache_lookup_ex
(
	const uint8* const __restrict cpSPUPtrExist, 
	const uint8* const __restrict cpPPUPtrExist,
	const uint8* const __restrict cpPtr,
	const uint32 cLRUIncr,
	const uint32 cProfID,
	const int cPrefDiff
)
{
	//check for valid pointer
	//	spu_CheckCacheHazard((uint32)cpSPUPtrExist >= (uint32)g_pSPUCache);
	//	CheckIsPresent(cpSPUPtrExist);
	const uint32 cAlignedPPUEA		 = (uint32)cpPPUPtrExist & ~scSPUCacheLineSizeMask;
	const uint32 cAlignedPPtrPPUEA = (uint32)cpPtr & ~scSPUCacheLineSizeMask;
	IF(cAlignedPPtrPPUEA == cAlignedPPUEA, true)//if in the same cache line, reuse
	{
		const uint32 cAlignedSPUEA		 = (uint32)cpSPUPtrExist & ~scSPUCacheLineSizeMask;
		const uint32 cOffsetPPUPtr		 = (uint32)cpPtr & scSPUCacheLineSizeMask;
		return (void*)(cAlignedSPUEA + cOffsetPPUPtr);
	}
#if defined(DO_SPU_PROFILING)
	return __spu_cache_lookup_prof((uint32)cpPtr, cLRUIncr, cProfID, cPrefDiff);
#else
	return __spu_cache_lookup((uint32)cpPtr, cLRUIncr, cPrefDiff);
#endif
}

__attribute__((always_inline))
inline void* __spu_cache_lookup0_select
(
	const uint8* const __restrict cpPPUPTR,				//ppu pointer of first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist1,	//SPU address of second cache line
	const uint32 cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint32 cLRUIncr,
	const uint32 cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist1);
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR > 256 * 1024);
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpPPUPTR & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr, 0), spu_promote((uint32)127, 0));
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_prof((uint32)cpPPUPTR, cLRUIncr, cProfID, cPrefDiff);
#else
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup((uint32)cpPPUPTR, cLRUIncr, cPrefDiff);
#endif
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)(*cppSPUPtrExist0) & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//performs a __spu_cache_lookup_ex for the first cache line in relation to the second one
__attribute__((always_inline))
inline void* __spu_cache_lookup0_select_ex
(
	const uint8* const __restrict cpPPUPTR,				//ppu pointer of first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist1,	//SPU address of second cache line
	const uint32 cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint32 cDiff,														//pointer diff between both cache lines
	const uint32 cLRUIncr,
	const uint32 cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist1);
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR > 256 * 1024);
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpPPUPTR & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((uint32)127,0));
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_ex(cpSPUPtrExist1, cpPPUPTR+cDiff, cpPPUPTR, cLRUIncr, cProfID, cPrefDiff);
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)(*cppSPUPtrExist0) & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

__attribute__((always_inline))
inline void* __spu_cache_lookup1_select
(
	const uint8* const __restrict cpPPUPTR1,			//ppu pointer of second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist0,	//SPU address of first cache line
	const uint32 cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint32 cLRUIncr,
	const uint32 cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist0);
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR1 > 256 * 1024);
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((uint32)127,0));
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_prof((uint32)cpPPUPTR1, cLRUIncr, cProfID, cPrefDiff);
#else
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup((uint32)cpPPUPTR1, cLRUIncr, cPrefDiff);
#endif
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)(*cppSPUPtrExist1) & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//performs a __spu_cache_lookup_ex for the second cache line in relation to the first one
//the obtained SPU address of the second line is stored into *cppSPUPtrExist1, the return value is the
//	SPU address for offset cpPPUPtrOff, taken from the first or the second line
//an offset (cpPPUPtrOff + in-line offset of cpSPUPtrExist0) above 127 selects the second line
__attribute__((always_inline))
inline void* __spu_cache_lookup1_select_ex
(
	const uint8* const __restrict cpPPUPTR1,			//ppu pointer of second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist0,	//SPU address of first cache line
	const uint32 cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint8* const __restrict cpPPUPTR0,			//ppu pointer of first cache line (the PPU counterpart of cpSPUPtrExist0)
	const uint32 cLRUIncr,
	const uint32 cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist0);
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR1 > 256 * 1024);
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((uint32)127,0));
	//reuses the first line if cpPPUPTR1 lies in the same cache line as cpPPUPTR0
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_ex(cpSPUPtrExist0, cpPPUPTR0, cpPPUPTR1, cLRUIncr, cProfID, cPrefDiff);
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)(*cppSPUPtrExist1) & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

__attribute__((always_inline))
inline void* __spu_cache_lookup01_select
(
	const uint8* const __restrict cpPPUPTR0,				//PPU address for first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpPPUPTR1,				//PPU address for second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const uint32 cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint32 cLRUIncr0,
	const uint32 cLRUIncr1,
	const uint32 cProfID,													//profiling ID
	const int cPrefDiff0,
	const int cPrefDiff1
)
{
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR0 > 256 * 1024);
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR1 > 256 * 1024);
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpPPUPTR0 & scSPUCacheLineSizeMask);
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_prof((uint32)cpPPUPTR0, cLRUIncr0, cProfID, cPrefDiff0);
#else
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup((uint32)cpPPUPTR0, cLRUIncr0, cPrefDiff0);
#endif
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((uint32)127,0));
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)*cppSPUPtrExist0 & ~scSPUCacheLineSizeMask;
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_prof((uint32)cpPPUPTR1, cLRUIncr1, cProfID, cPrefDiff1);
#else
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup((uint32)cpPPUPTR1, cLRUIncr1, cPrefDiff1);
#endif
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)*cppSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//performs a __spu_cache_lookup for the first cache line and a 
//	__spu_cache_lookup_ex for the second cache line in relation to the first one
__attribute__((always_inline))
inline void* __spu_cache_lookup01_select_ex
(
	const uint8* const __restrict cpPPUPTR0,			//PPU address for first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpPPUPTR1,			//PPU address for second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const uint32 cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint32 cLRUIncr0,
	const uint32 cLRUIncr1,
	const uint32 cProfID,													//profiling ID
	const int cPrefDiff0,
	const int cPrefDiff1
)
{
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR0 > 256 * 1024);
	spu_CheckCacheHazard((ptrdiff_t)cpPPUPTR1 > 256 * 1024);
	const uint32 cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((uint32)cpPPUPTR0 & scSPUCacheLineSizeMask);
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_prof((uint32)cpPPUPTR0, cLRUIncr0, cProfID, cPrefDiff0);
#else
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup((uint32)cpPPUPTR0, cLRUIncr0, cPrefDiff0);
#endif
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((uint32)127,0));
	const uint32 cAlignedSPUExist0 _ALIGN(16)		= (uint32)*cppSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_ex(*cppSPUPtrExist0, cpPPUPTR0, cpPPUPTR1, cLRUIncr1, cProfID, cPrefDiff1);
	const uint32 cAlignedSPUExist1 _ALIGN(16)		= (uint32)*cppSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

#define __cache_interpolate(RESULT, LOCAL0, LOCAL1, OFFSET, DIFF, MEMREF_ID) ({ \
	__cache_select(RESULT, LOCAL0, LOCAL1, OFFSET, MEMREF_ID); })

#define __cache_lookup0_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	RESULT = __spu_cache_lookup0_select(_PTR, (const uint8** __restrict)&(LOCAL0), (const uint8* const __restrict)LOCAL1, OFFSET/8, LRU_INCR, MEMREF_ID, PREF_DIFF); })

#define __cache_lookup1_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
	RESULT = __spu_cache_lookup1_select(_PTR1, (const uint8**)&(LOCAL1), (const uint8* const __restrict)LOCAL0, OFFSET/8, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//looks up both operands and selects between them
//	_PTR0  = MAIN + (MAIN_OFFSET - OFFSET) / 8
//	_PTR1  = MAIN + (MAIN_OFFSET - OFFSET + DIFF) / 8
//	RESULT = value returned by __spu_cache_lookup01_select, the addresses of LOCAL0 and LOCAL1 are handed over as well
//OFFSET is parenthesized before the division so that compound expressions expand as one operand
#define __cache_lookup01_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR0, LRU_INCR1, PREF_DIFF0, PREF_DIFF1, MEMREF_ID) ({ \
	const uint8_t *const _PTR0 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
	RESULT = __spu_cache_lookup01_select(_PTR0, (const uint8** __restrict)&(LOCAL0), _PTR1, (const uint8** __restrict)&(LOCAL1), (OFFSET)/8, LRU_INCR0, LRU_INCR1, MEMREF_ID, PREF_DIFF0, PREF_DIFF1); })

//"ex" variant of __cache_lookup0_interpolate
//	_PTR   = MAIN + (MAIN_OFFSET - OFFSET) / 8
//	RESULT = value returned by __spu_cache_lookup0_ex_select
//DIFF is passed on unchanged (it is not divided by 8 here)
//OFFSET and LOCAL1 are parenthesized so that compound expressions expand as one operand
#define __cache_lookup0_ex_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	RESULT = __spu_cache_lookup0_ex_select(_PTR, (const uint8** __restrict)&(LOCAL0), (const uint8* const __restrict)(LOCAL1), (OFFSET)/8, DIFF, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//"ex" variant of __cache_lookup1_interpolate
//	_PTR0  = MAIN + (MAIN_OFFSET - OFFSET) / 8
//	_PTR1  = _PTR0 + DIFF / 8
//	RESULT = value returned by __spu_cache_lookup1_select_ex, which receives _PTR0 in addition to _PTR1
//OFFSET and LOCAL0 are parenthesized so that compound expressions expand as one operand
#define __cache_lookup1_ex_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR0 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	const uint8_t *const _PTR1 = _PTR0 + (DIFF) / 8; \
	RESULT = __spu_cache_lookup1_select_ex(_PTR1, (const uint8** __restrict)&(LOCAL1), (const uint8* const __restrict)(LOCAL0), (OFFSET)/8, _PTR0, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//"ex" variant of __cache_lookup01_interpolate
//	_PTR0  = MAIN + (MAIN_OFFSET - OFFSET) / 8
//	_PTR1  = MAIN + (MAIN_OFFSET - OFFSET + DIFF) / 8
//	RESULT = value returned by __spu_cache_lookup01_select_ex, the addresses of LOCAL0 and LOCAL1 are handed over as well
//OFFSET is parenthesized before the division so that compound expressions expand as one operand
#define __cache_lookup01_ex_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR0, LRU_INCR1, PREF_DIFF0, PREF_DIFF1, MEMREF_ID) ({ \
	const uint8_t *const _PTR0 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
	RESULT = __spu_cache_lookup01_select_ex(_PTR0, (const uint8** __restrict)&(LOCAL0), _PTR1, (const uint8** __restrict)&(LOCAL1), (OFFSET)/8, LRU_INCR0, LRU_INCR1, MEMREF_ID, PREF_DIFF0, PREF_DIFF1); })

//touches the address PTR + OFFSET / 8
//before the touch, the condition "address >= g_pSPUCache" is handed to spu_CheckCacheHazard and the
//	address is handed to CheckIsPresent; PROFID is not used
#define __cache_touch(PTR, OFFSET, LRU_INCR, PROFID) \
({ \
	const uint8_t *const _PTR = (const uint8_t *)(PTR) + (OFFSET) / 8; \
	spu_CheckCacheHazard((uint32)_PTR >= (uint32)g_pSPUCache); \
	CheckIsPresent((uint32)_PTR); \
	__spu_cache_touch((const uint8* const)_PTR, LRU_INCR); \
})

//forwards the address PTR + OFFSET / 8 together with TOUCH_CONST to __spu_cache_prep_touch
#define __cache_touch_prep(PTR, OFFSET, TOUCH_CONST) \
({ \
	const uint8_t *const _PTR = (const uint8_t *)(PTR) + (OFFSET) / 8; \
	__spu_cache_prep_touch((const uint8* const)_PTR, TOUCH_CONST); \
})

//forwards to __spu_cache_touch_ex, PROFID is dropped
#define __cache_touch_ex(ADDR, LRU_INCR, PROFID) \
	__spu_cache_touch_ex(ADDR, LRU_INCR)

//registers the range [cpFrom, cpTo] via SPUAddCacheWriteRangeAsync, both bounds are cast to uint32
#define __cache_range_write_async(cpFrom, cpTo) \
	SPUAddCacheWriteRangeAsync((uint32)(cpFrom), (uint32)(cpTo))

__attribute__((always_inline))
inline void SPUAddCacheWriteRangeAsync(const uint32 cEAFrom, const uint32 cEATo)
{
	vec_uint4 *const __restrict pAsyncRangesDirFrom = (vec_uint4*)G_SPU_CACHE_ASYNC_RANGES_DIR_FROM;
	vec_uint4 *const __restrict pAsyncRangesDirTo		= (vec_uint4*)G_SPU_CACHE_ASYNC_RANGES_DIR_TO;
	vec_uint4 curAsyncRangesDirFrom = *pAsyncRangesDirFrom;
	vec_uint4 curAsyncRangesDirTo		= *pAsyncRangesDirTo;

	//start at next cache line boundary (unsafe to start within a cacheline)
	const uint32 cEAAlignedFrom = (cEAFrom + scSPUCacheLineSizeMask) & ~scSPUCacheLineSizeMask;
	const uint32 cEAAlignedTo		= (cEATo + scSPUCacheLineSizeMask) & ~scSPUCacheLineSizeMask;
	//rotate existing one 4 bytes and insert into slot 0
	curAsyncRangesDirFrom	= spu_rlqwbyte(curAsyncRangesDirFrom, 4);
	curAsyncRangesDirTo		= spu_rlqwbyte(curAsyncRangesDirTo, 4);
	curAsyncRangesDirFrom	= spu_insert(cEAAlignedFrom, curAsyncRangesDirFrom, 0);
	curAsyncRangesDirTo		= spu_insert(cEAAlignedTo, curAsyncRangesDirTo, 0);

	*pAsyncRangesDirFrom	= curAsyncRangesDirFrom;
	*pAsyncRangesDirTo		= curAsyncRangesDirTo;
};

//------------------------------------memory allocation---------------------------------------------------

//route the C allocation names to the CryModule* wrappers
#undef malloc
#define malloc  CryModuleMalloc

#undef realloc
#define realloc CryModuleRealloc

#undef free
#define free    CryModuleFree

namespace std
{
	__attribute__((always_inline))
	inline void* CryMemAlign(const size_t cBoundary, const size_t cSize)
	{
		void *pPtr = NULL;
		PPUAlloc_func(pPtr, (cSize + (cBoundary-1)) & ~(cBoundary-1));
		assert((uint32)pPtr & (cBoundary-1) == 0);
		return pPtr;
	}
}
using std::CryMemAlign;
#define memalign CryMemAlign

//malloc replacement: requests cSize bytes through PPUAlloc_func
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* CryModuleMalloc(const size_t cSize) throw()
{
	void *pMem = NULL;
	PPUAlloc_func(pMem, cSize);
	return pMem;
}

//realloc replacement: releases pPtr through PPUFree_func and requests a new block of cSize bytes through PPUAlloc_func
//NOTE: unlike standard realloc, the content of the old block is NOT copied into the new one
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* CryModuleRealloc(void *pPtr, const size_t cSize)  throw()
{
	PPUFree_func(pPtr);
	//start from NULL like CryModuleMalloc does, otherwise the freed address would be returned
	//	whenever PPUAlloc_func leaves the pointer untouched
	pPtr = NULL;
	PPUAlloc_func(pPtr, cSize);
	return pPtr;
}

//free replacement: hands pPtr to PPUFree_func
__attribute__((always_inline))
inline void CryModuleFree(void *pPtr) throw()
{
	PPUFree_func(pPtr);
};

//malloc replacement taking a module tag: the ECryModule argument is ignored
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* CryModuleMalloc(const size_t cSize, ECryModule) throw()
{
	void *pMem = NULL;
	PPUAlloc_func(pMem, cSize);
	return pMem;
}

//realloc replacement taking a module tag: the ECryModule argument is ignored
//releases pPtr through PPUFree_func and requests a new block of cSize bytes through PPUAlloc_func
//NOTE: unlike standard realloc, the content of the old block is NOT copied into the new one
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* CryModuleRealloc(void *pPtr, const size_t cSize, ECryModule)  throw()
{
	PPUFree_func(pPtr);
	//start from NULL like CryModuleMalloc does, otherwise the freed address would be returned
	//	whenever PPUAlloc_func leaves the pointer untouched
	pPtr = NULL;
	PPUAlloc_func(pPtr, cSize);
	return pPtr;
}

//free replacement taking a module tag: the ECryModule argument is ignored, pPtr is handed to PPUFree_func
__attribute__((always_inline))
inline void CryModuleFree(void *pPtr, ECryModule) throw()
{
	PPUFree_func(pPtr);
};

//global operator new: requests cSize bytes through PPUAlloc_func
//nothing is thrown in this function despite the exception specification; the result is the pointer
//	stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void * operator new(size_t cSize) throw (std::bad_alloc)
{
	void *pMem = NULL;
	PPUAlloc_func(pMem, cSize);
	return pMem;
}

//nothrow operator new: requests cSize bytes through PPUAlloc_func, the nothrow tag is not used
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* operator new (size_t cSize, const std::nothrow_t &nothrow) throw()
{
	void *pMem = NULL;
	PPUAlloc_func(pMem, cSize);
	return pMem;
}

//global operator new[]: requests cSize bytes through PPUAlloc_func
//nothing is thrown in this function despite the exception specification; the result is the pointer
//	stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* operator new[](size_t cSize) throw (std::bad_alloc)
{
	void *pMem = NULL;
	PPUAlloc_func(pMem, cSize);
	return pMem;
}

//nothrow operator new[]: requests cSize bytes through PPUAlloc_func, the nothrow tag is not used
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void* operator new[] (size_t cSize, const std::nothrow_t &nothrow) throw()
{
	void *pMem = NULL;
	PPUAlloc_func(pMem, cSize);
	return pMem;
}

//global operator delete: hands pPtr to PPUFree_func
__attribute__((always_inline))
inline void operator delete(void *pPtr) throw()
{
	PPUFree_func(pPtr); 
}

//nothrow operator delete: hands pPtr to PPUFree_func, the nothrow tag is not used
__attribute__((always_inline))
inline void operator delete(void *pPtr, const std::nothrow_t&) throw()
{
	PPUFree_func(pPtr); 
}

//global operator delete[]: hands pPtr to PPUFree_func
__attribute__((always_inline))
inline void operator delete[](void *pPtr) throw()
{ 
	PPUFree_func(pPtr);
}

//nothrow operator delete[]: hands pPtr to PPUFree_func, the nothrow tag is not used
__attribute__((always_inline))
inline void operator delete[](void *pPtr, const std::nothrow_t&) throw()
{ 
	PPUFree_func(pPtr);
}

//operator new with an alignment argument
//cAlignment is only checked by the assert (must not exceed 128); the allocation request itself is
//	made with the unmodified cSize, truncated to uint32
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void *operator new(_CSTD size_t cSize, _CSTD size_t cAlignment) throw (std::bad_alloc)
{
	assert(cAlignment <= 128);
	void *pMem = NULL;
	const uint32 cAllocSize = cSize;
	PPUAlloc_func(pMem, cAllocSize);
	return pMem;
}

//nothrow operator new with an alignment argument, the nothrow tag is not used
//cAlignment is only checked by the assert (must not exceed 128); the allocation request itself is
//	made with the unmodified cSize, truncated to uint32
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void *operator new(_CSTD size_t cSize, _CSTD size_t cAlignment, const std::nothrow_t&) throw()
{
	assert(cAlignment <= 128);
	void *pMem = NULL;
	const uint32 cAllocSize = cSize;
	PPUAlloc_func(pMem, cAllocSize);
	return pMem;
}

//operator new[] with an alignment argument
//cAlignment is only checked by the assert (must not exceed 128); the allocation request itself is
//	made with the unmodified cSize, truncated to uint32
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void *operator new[](_CSTD size_t cSize, _CSTD size_t cAlignment) throw (std::bad_alloc)
{
	assert(cAlignment <= 128);
	void *pMem = NULL;
	const uint32 cAllocSize = cSize;
	PPUAlloc_func(pMem, cAllocSize);
	return pMem;
}

//nothrow operator new[] with an alignment argument, the nothrow tag is not used
//cAlignment is only checked by the assert (must not exceed 128); the allocation request itself is
//	made with the unmodified cSize, truncated to uint32
//returns the pointer stored by PPUAlloc_func (NULL if it stored nothing)
__attribute__((always_inline))
inline void *operator new[](_CSTD size_t cSize, _CSTD size_t cAlignment, const std::nothrow_t&) throw()
{
	assert(cAlignment <= 128);
	void *pMem = NULL;
	const uint32 cAllocSize = cSize;
	PPUAlloc_func(pMem, cAllocSize);
	return pMem;
}

//------------------------------------code paging---------------------------------------------------

//sets the currently active bubble IDs
//the inline asm instructions are there to create the instruction slots for the bubble ID replacement
//this is necessary since the extern int placeholders create lqr instructions, which are on pipe 1
//	(unlike ila, which is located on pipe 0) (do not change the instructions, the binary op code is looked up)
//one bubble variant: emits a single placeholder instruction, then calls SPUSetActiveBubblesFunc
//	with cID0 and -1 for the three remaining IDs
__attribute__((always_inline))
inline void __spu_set_active_bubbles(const int cID0)
{
	//placeholder instruction, must stay exactly as written
	__asm__ volatile ("ori $79,$79,0" :  : );
	SPUSetActiveBubblesFunc(cID0, -1, -1, -1);
};

//two bubble variant: emits one placeholder instruction per ID (immediates 0 and 1), then calls
//	SPUSetActiveBubblesFunc with cID0, cID1 and -1 for the two remaining IDs
__attribute__((always_inline))
inline void __spu_set_active_bubbles(const int cID0, const int cID1)
{
	//placeholder instructions, must stay exactly as written
	__asm__ volatile ("ori $79,$79,0" :  : );
	__asm__ volatile ("ori $79,$79,1" :  : );
	SPUSetActiveBubblesFunc(cID0, cID1, -1, -1);
};

//sets the currently active bubble IDs
//three bubble variant: emits one placeholder instruction per ID (immediates 0, 1 and 2), then calls
//	SPUSetActiveBubblesFunc with cID0, cID1, cID2 and -1 for the remaining ID
__attribute__((always_inline))
inline void __spu_set_active_bubbles(const int cID0, const int cID1, const int cID2)
{
	//placeholder instructions, must stay exactly as written
	__asm__ volatile ("ori $79,$79,0" :  : );
	__asm__ volatile ("ori $79,$79,1" :  : );
	__asm__ volatile ("ori $79,$79,2" :  : );
	SPUSetActiveBubblesFunc(cID0, cID1, cID2, -1);
};

//sets the currently active bubble IDs
//four bubble variant: emits one placeholder instruction per ID (immediates 0 to 3), then calls
//	SPUSetActiveBubblesFunc with all four IDs
__attribute__((always_inline))
inline void __spu_set_active_bubbles(const int cID0, const int cID1, const int cID2, const int cID3)
{
	//placeholder instructions, must stay exactly as written
	__asm__ volatile ("ori $79,$79,0" :  : );
	__asm__ volatile ("ori $79,$79,1" :  : );
	__asm__ volatile ("ori $79,$79,2" :  : );
	__asm__ volatile ("ori $79,$79,3" :  : );
	SPUSetActiveBubblesFunc(cID0, cID1, cID2, cID3);
};

//init the upcoming cross bubble call by storing cCrossData into g_CrossBubbleData
//NOTE(review): the original note states that this forces an insertion of a lqr instruction into $71
//	and also increments the return stack; neither is visible in this function - confirm
__attribute__((always_inline))
inline void __spu_init_cross_call(const vec_ushort8 cCrossData)
{
	g_CrossBubbleData = cCrossData;
}

//resolves a global variable offset to an address
//emits two placeholder instructions (ilhu/iohl on $71 with immediate 0) and returns the result of
//	SPUResolveGlobalVarFunc(cGlobVarOff)
//	cGlobVarOff		offset of the global variable, passed on unchanged
__attribute__((always_inline))
inline const uint32 __cache_resolve_global_var_addr(const int cGlobVarOff)
{
	__asm__ volatile ("ilhu $71,0" :  : );//placeholder for upper integer load instr.
	__asm__ volatile ("iohl $71,0" :  : );//placeholder for lower integer load instr.
	return SPUResolveGlobalVarFunc(cGlobVarOff);
}

__attribute__((always_inline))
inline void SetEnableSPUJobAtParentExit(const uint32 cNewPPUPushEA)
{
	//this makes the parent job not call any callback or set the external job state
	vec_uint4 *const __restrict pJobSpawnReg = (vec_uint4*)(void*)G_SPU_JOB_SPAWN_REG;
	assert(spu_extract(*pJobSpawnReg, JOB_SPAWN_STATE_WORD) == 0);//check if there is not already been a registered job
	*pJobSpawnReg = spu_insert(cNewPPUPushEA, *pJobSpawnReg, JOB_SPAWN_PUSH_WORD);
}

#if !defined(_LIB_DRIVER)
	#include <PPU/SPUJobBase.h>
#endif

//--------------------------------------------------------------------------------------------------

#if defined(SUPP_SN)
	//returns the int stored at the address G_SPU_DEBUG_STATE
	__attribute__((always_inline))
	inline const int IsDebugEnabled()
	{
		int *const pDebugState = (int*)G_SPU_DEBUG_STATE;
		return *pDebugState;
	}
#endif

//returns G_SPU_LOG cast to an ILog pointer
__attribute__((always_inline))
inline ILog* GetISPULog()
{
	return (ILog*)G_SPU_LOG;
}

//forwards a format string and its va_list unchanged to SPULogMessageV
//	szFormat	format string
//	args			argument list belonging to szFormat
__attribute__((always_inline))
inline void LogMessageV(const char* szFormat, va_list args)
{
	SPULogMessageV(szFormat, args);
}

#else //_SPU_JOB
	//empty stand-in used when _SPU_JOB is not defined, the argument is ignored
	inline void __spu_dma_pref(const unsigned int)
	{
	}
#endif //_SPU_JOB

//SPU_DEBUG_BREAK: with SUPP_SN it issues "stop 255" if IsDebugEnabled() returns non-zero, otherwise it expands to nothing
//the SUPP_SN form is wrapped into do/while(0) so that it behaves as one statement and cannot capture
//	an else following it (if(x) SPU_DEBUG_BREAK; else ... would otherwise bind the else to the inner if)
#if defined(SUPP_SN)
	#undef SPU_DEBUG_BREAK
	#define SPU_DEBUG_BREAK do { if(IsDebugEnabled()) __asm volatile ("stop 255"); } while(0)
#else
	#define SPU_DEBUG_BREAK
#endif

#endif //__SPU__
#endif //PS3
#endif //__SPU_JOB_H
