/*
	all required headers for a job
	this must never be included from a header file or non SPU job (_SPU_JOB will be defined from the makefile)
*/
#ifndef __SPU_JOB_H
#define __SPU_JOB_H
#pragma once

#if defined(PS3) && defined(__cplusplus)

#if defined(__SPU__) 

#include <SPU/SPU.h>
#include <IJobManSPU.h>
#include <stdarg.h>

//any existing macro definition of abort is dropped, then abort is redefined so that
//every later use of the identifier in SPU code expands to _exit
#undef abort
#define abort _exit

#if defined(SUPP_SN)
	#include <libsn_spu.h>
#endif

#include "Cache/CacheDefs_spu.h"

#include <cell/dma.h>
//prepares the mfc_getllar command without issuing it
//	asserts that pLS (taken as an unsigned int) is below 256*1024, then writes pLS to the
//	MFC_LSA channel and mfc_ea2l(cEAAddr) to the MFC_EAL channel; MFC_Cmd is not written here
//	(see the mfc_*_again macros for that)
//	pLS appears twice in the expansion, so pass an expression without side effects
#define mfc_prep(pLS, cEAAddr)({\
	assert((unsigned int)(pLS) < 256*1024);\
	si_wrch(MFC_LSA,si_from_ptr(pLS));\
	si_wrch(MFC_EAL,si_from_uint(mfc_ea2l(cEAAddr)));})

//atomic functions which follow a call to mfc_getllar or mfc_putllc using the same parameters (keep channels)
//they just issue the command reusing all set registers from the previous calls
//each macro writes only the MFC_Cmd channel; the opcode is spelled with the SDK's named constant
//	(MFC_GETLLAR_CMD is 208, MFC_PUTLLC_CMD is 180) so all three macros read alike
#define mfc_getllar_again() si_wrch(MFC_Cmd, si_from_uint(MFC_GETLLAR_CMD))
#define mfc_putllc_again() si_wrch(MFC_Cmd, si_from_uint(MFC_PUTLLC_CMD))
#define mfc_putlluc_again() si_wrch(MFC_Cmd, si_from_uint(MFC_PUTLLUC_CMD))

//------------------------------------software cache---------------------------------------------------

#if defined(_SPU_JOB)
//function pointer types for the entry points that the SPU* pointer table further down is cast to
//every typedef is a const pointer ('*const'), so a variable of these types cannot be re-pointed after initialisation
//the signatures of the lookup and flush types depend on ENABLE_HAZARD_MODE / DO_SPU_PROFILING,
//	so callers and the code they point at must be built with the same defines
typedef const int (*const TSPUResolveGlobalVarFunc)(const int);
//variadic, printf-style signature (format string followed by arguments)
typedef void (*const TSPUPrintfHandler)(const char*, ...);
#if defined(ENABLE_HAZARD_MODE)
	//hazard mode: 3 arguments, matching the (ea, LRUIncr, PrefDiff) call made by __spu_cache_lookup
	typedef void* (*const TSPUDoLookupCacheFunc)(const unsigned int, const unsigned int, const int);
#else
	//default: 6 arguments, filled in by the __spu_cache_lookup / __spu_cache_lookup_miss macros
	typedef void* (*const TSPUDoLookupCacheFunc)(const unsigned int, const unsigned int, const unsigned int, const unsigned int, const unsigned int, const unsigned int);
#endif//ENABLE_HAZARD_MODE
//called by __cache_assert with (SPU ptr, PPU address, __FILE__, __LINE__); a result of 0 makes the macro halt
typedef int (*const TSPUCacheAssertFunc)(const unsigned int, const unsigned int, const char*, const unsigned int);
typedef void (*const TSPUSetActivePagesFunc)(const vec_uchar16, const vec_uchar16, const vec_uchar16, const vec_uchar16);
typedef void (*const TSPUSyncAtomicDCache)();
typedef void (*const TSPUStartAtomicWrite)();
//vprintf-style signature (format string plus va_list)
typedef void (*const TLogMessageV)(const char*, va_list);
#if defined(DO_SPU_PROFILING)
	//profiling builds take a second bool; __spu_flush_cache passes (true, false)
	typedef void (*const TSPUFlushCacheFunc)(const bool, const bool);
#else
	typedef void (*const TSPUFlushCacheFunc)(const bool);
#endif
//called by __spu_dma_pref with the address cast to unsigned int
typedef void (*const TSPUDMAPrefFunc)(const unsigned int);
typedef int	 (*const TSPUGetCurPageID)();
typedef unsigned int (*const TSPUGetCurPageAddr)();
typedef vec_ushort8 (*const TSPUGenFuncPtrFromId)(const unsigned int);
typedef void (*const TSPUExecPPUCall)(const unsigned int, const unsigned int);
typedef void (*const TSPUProfDataHandlerFunc)();
typedef void (*const TSPUTransferFuncProfStatsFunc)();
typedef const unsigned int (*const TSPUFuncPtrAssertFunc)(const unsigned int, const unsigned int);

//external functions coming from libDriverDMA.a
//only declared here; the definitions are not part of this header
//the VM/VLM/VML/VMM suffixes match the memset$VM_ / memcpy$V??_ name mappings further down in this file
//	NOTE(review): presumably L = local store and M = main memory operand, in argument order - confirm
extern void* Memset__VM(void*, int, unsigned int);
extern void MemsetLargeNoCache128(void*, unsigned int, unsigned int);
extern void* Memcpy__VLM(void* const, const void* const, const unsigned int);
extern void* Memcpy__VML(void* const, const void* const, const unsigned int);
extern void* Memcpy__VMM(void* const, const void* const, const unsigned int);
//used by the __spu_flush_cache_* (last argument true) and __spu_invalidate_cache_* (last argument false) macros
extern void FlushCacheRange(const unsigned int, const unsigned int, const bool);
//used by __cache_reload_line with (SPU pointer, PPU address)
extern void ReloadCacheLine(const unsigned int , const unsigned int );
//used by the volatile lookup variants with (ea, LRU increment, prefetch diff)
extern void* DoVolatileCacheLookup(const unsigned int , const unsigned int , const int);
//used by __cache_write_line with (SPU pointer, PPU address)
extern void WriteBackCacheLine(const unsigned int, const unsigned int);
extern void CustomCallbackHandler(void*, void*);
extern void PPUFree_func(void*);
//the allocation functions return their result through the void*& argument
extern void PPUAlloc_func(void*&, const size_t);
extern void PPUCalloc_func(void*&, const size_t, const size_t);
extern void CleanupMemory();
extern void SPUExecPPUCall(const unsigned int, const unsigned int);

//expands to the angle-bracket include name < <name>_ppu_include.h>, for use as '#include PPU_JOB_DEF(name)'
#define PPU_JOB_DEF(name) < name ## _ppu_include.h>

//forward declaration only; NPPU::EAddJobRes must already be declared by an earlier include
namespace NPPU{class CSPUJobDel;}
//declared here, defined elsewhere; the unnamed parameters are not described in this header
extern const NPPU::EAddJobRes RunSPUJob
(
	const NPPU::CSPUJobDel&,
	const unsigned int, 
	const unsigned char, 
	const unsigned int,
	const bool,
	const unsigned short
);

//table of fixed addresses, one variant per build flavour (_DEBUG, else DO_SPU_PROFILING, else default):
//	- every static SPU* function pointer is initialised by casting an integer constant to its typedef
//	- every G_SPU_* macro is a plain integer constant
//	- an entry that is 0 in a variant is a null pointer / unset address in that variant
//		(__cache_assert tests SPUDoLookupCacheAssertFunc before calling it; other callers in this header do not test)
//	- the pointers are 'static', so each translation unit including this header gets its own copy
//	NOTE(review): the constants are presumably local store addresses inside the driver image - confirm against the makefile step
//address of memory manager, will be updated by makefile
//do not edit these lines by hand, accessed in SPUDriverMemManagerParser
//-------------------------------------------------------------------------------------------------

#if defined(_DEBUG)

static TSPUDoLookupCacheFunc SPUDoLookupCache_func = (const TSPUDoLookupCacheFunc)17024;
static TSPUDoLookupCacheFunc SPUDoLookupCacheMiss_func = (const TSPUDoLookupCacheFunc)16768;
static TSPUDoLookupCacheFunc SPUDoLookupCacheChecked_func = (const TSPUDoLookupCacheFunc)0;
static TSPUPrintfHandler SPUPrintfHandler_func = (const TSPUPrintfHandler)0;
static TSPUCacheAssertFunc SPUDoLookupCacheAssertFunc = (const TSPUCacheAssertFunc)11312;
static TSPUSetActivePagesFunc SPUSetActivePagesFunc = (const TSPUSetActivePagesFunc)19712;
static TSPUResolveGlobalVarFunc SPUResolveGlobalVarFunc = (TSPUResolveGlobalVarFunc)11288;
static TSPUSyncAtomicDCache SPUSyncAtomicDCache = (TSPUSyncAtomicDCache)11200;
static TSPUStartAtomicWrite SPUStartAtomicWrite = (TSPUStartAtomicWrite)10968;
static TLogMessageV SPULogMessageV = (TLogMessageV)0;
static TSPUFlushCacheFunc SPUFlushCache = (TSPUFlushCacheFunc)11560;
static TSPUDMAPrefFunc SPUDMAPref = (TSPUDMAPrefFunc)12360;
static TSPUGetCurPageID SPUGetCurPageID = (TSPUGetCurPageID)14720;
static TSPUGetCurPageAddr SPUGetCurPageAddr = (TSPUGetCurPageAddr)14776;
static TSPUGenFuncPtrFromId SPUGenFuncPtrFromId = (TSPUGenFuncPtrFromId)14824;
static TSPUProfDataHandlerFunc SPUProfDataHandlerFunc = (TSPUProfDataHandlerFunc)0;
static TSPUFuncPtrAssertFunc SPUFuncPtrAssertFunc = (TSPUFuncPtrAssertFunc)16408;
static TSPUTransferFuncProfStatsFunc SPUTransferFuncProfStatsFunc = (TSPUTransferFuncProfStatsFunc)15088;
#define G_SPU_CACHE_PROF_ID_COUNTER_ADDR 0
#define G_SPU_CACHE_PROF_PERF_STAT_ADDR 0
#define G_SPU_CACHE_PREF_DIR_ADDR 26256
#define G_SPU_CACHE_PREF_LRU_DIR_ADDR 26240
#define G_SPU_CACHE_PREF_BUF_ADDR 24192
#define G_SPU_CACHE_ASYNC_RANGES_DIR_FROM 26272
#define G_SPU_CACHE_ASYNC_RANGES_DIR_TO 26288
#define G_SPU_CACHE_CUR_ATOMIC_EA 26224
#define G_SPU_CACHE_SHADOW_CACHE 26176
#define G_SPU_INFO_BLOCK 23168
#define G_SPU_PAGE_DIR_INFO 26096
#define G_SPU_JOB_SPAWN_REG 24896
#define G_SPU_DEBUG_STATE 24864
#define G_SPU_NUM_SETS 26160
#define G_SPU_LOG 0
#define G_SPU_PROGRAM_TOP_LS 24784
#define G_SPU_CPMH 18048
#define G_SPU_HTL 20480
#define G_SPU_LHTL 21056
#define G_SPU_ATOMIC_BUF 23552
#define G_SPU_GCM_CONTEXT_LOCAL_ADDR 24832
#define G_SPU_ZERO16 24912
#define G_SPU_INFO_PACKET_ADDR 24800
#define G_SPU_DABR_ADDR 0
#define G_SPU_JOB_RESOLVE_FUNC_ADDR 24768
#define G_FRAME_STATS_DEST_AREA 24944
#define G_SPU_ID_ADDR 24848
#define G_SPU_CUSTOM_CALLBACK_AREA 24976
#define G_SPU_MEM_MAN 23296
#define G_SPU_DEST_MEM_AREA 24880
#define G_SPU_FUNC_PROF_SPU_AREA 24992            

#elif defined(DO_SPU_PROFILING)

static TSPUDoLookupCacheFunc SPUDoLookupCache_func = (const TSPUDoLookupCacheFunc)11904;
static TSPUDoLookupCacheFunc SPUDoLookupCacheMiss_func = (const TSPUDoLookupCacheFunc)11648;
static TSPUDoLookupCacheFunc SPUDoLookupCacheChecked_func = (const TSPUDoLookupCacheFunc)0;
static TSPUPrintfHandler SPUPrintfHandler_func = (const TSPUPrintfHandler)0;
static TSPUCacheAssertFunc SPUDoLookupCacheAssertFunc = (const TSPUCacheAssertFunc)0;
static TSPUSetActivePagesFunc SPUSetActivePagesFunc = (const TSPUSetActivePagesFunc)13888;
static TSPUResolveGlobalVarFunc SPUResolveGlobalVarFunc = (TSPUResolveGlobalVarFunc)7368;
static TSPUSyncAtomicDCache SPUSyncAtomicDCache = (TSPUSyncAtomicDCache)7200;
static TSPUStartAtomicWrite SPUStartAtomicWrite = (TSPUStartAtomicWrite)6952;
static TLogMessageV SPULogMessageV = (TLogMessageV)0;
static TSPUFlushCacheFunc SPUFlushCache = (TSPUFlushCacheFunc)7392;
static TSPUDMAPrefFunc SPUDMAPref = (TSPUDMAPrefFunc)8328;
static TSPUGetCurPageID SPUGetCurPageID = (TSPUGetCurPageID)10736;
static TSPUGetCurPageAddr SPUGetCurPageAddr = (TSPUGetCurPageAddr)10792;
static TSPUGenFuncPtrFromId SPUGenFuncPtrFromId = (TSPUGenFuncPtrFromId)10840;
static TSPUProfDataHandlerFunc SPUProfDataHandlerFunc = (TSPUProfDataHandlerFunc)6840;
static TSPUFuncPtrAssertFunc SPUFuncPtrAssertFunc = (TSPUFuncPtrAssertFunc)10952;
static TSPUTransferFuncProfStatsFunc SPUTransferFuncProfStatsFunc = (TSPUTransferFuncProfStatsFunc)11040;
#define G_SPU_CACHE_PROF_ID_COUNTER_ADDR 19472
#define G_SPU_CACHE_PROF_PERF_STAT_ADDR 18016
#define G_SPU_CACHE_PREF_DIR_ADDR 31792
#define G_SPU_CACHE_PREF_LRU_DIR_ADDR 31776
#define G_SPU_CACHE_PREF_BUF_ADDR 17280
#define G_SPU_CACHE_ASYNC_RANGES_DIR_FROM 31808
#define G_SPU_CACHE_ASYNC_RANGES_DIR_TO 31824
#define G_SPU_CACHE_CUR_ATOMIC_EA 19440
#define G_SPU_CACHE_SHADOW_CACHE 19392
#define G_SPU_INFO_BLOCK 16256
#define G_SPU_PAGE_DIR_INFO 19312
#define G_SPU_JOB_SPAWN_REG 17984
#define G_SPU_DEBUG_STATE 17952
#define G_SPU_NUM_SETS 19376
#define G_SPU_LOG 0
#define G_SPU_PROGRAM_TOP_LS 17872
#define G_SPU_CPMH 12672
#define G_SPU_HTL 14464
#define G_SPU_LHTL 15040
#define G_SPU_ATOMIC_BUF 16640
#define G_SPU_GCM_CONTEXT_LOCAL_ADDR 17920
#define G_SPU_ZERO16 18000
#define G_SPU_INFO_PACKET_ADDR 17888
#define G_SPU_DABR_ADDR 0
#define G_SPU_JOB_RESOLVE_FUNC_ADDR 17856
#define G_FRAME_STATS_DEST_AREA 18160
#define G_SPU_ID_ADDR 17936
#define G_SPU_CUSTOM_CALLBACK_AREA 18192
#define G_SPU_MEM_MAN 16384
#define G_SPU_DEST_MEM_AREA 17968
#define G_SPU_FUNC_PROF_SPU_AREA 18208

#else

static TSPUDoLookupCacheFunc SPUDoLookupCache_func = (const TSPUDoLookupCacheFunc)8576;
static TSPUDoLookupCacheFunc SPUDoLookupCacheMiss_func = (const TSPUDoLookupCacheFunc)8448;
static TSPUDoLookupCacheFunc SPUDoLookupCacheChecked_func = (const TSPUDoLookupCacheFunc)0;
static TSPUPrintfHandler SPUPrintfHandler_func = (const TSPUPrintfHandler)0;
static TSPUCacheAssertFunc SPUDoLookupCacheAssertFunc = (const TSPUCacheAssertFunc)0;
static TSPUSetActivePagesFunc SPUSetActivePagesFunc = (const TSPUSetActivePagesFunc)12800;
static TSPUResolveGlobalVarFunc SPUResolveGlobalVarFunc = (TSPUResolveGlobalVarFunc)6968;
static TSPUSyncAtomicDCache SPUSyncAtomicDCache = (TSPUSyncAtomicDCache)7408;
static TSPUStartAtomicWrite SPUStartAtomicWrite = (TSPUStartAtomicWrite)6736;
static TLogMessageV SPULogMessageV = (TLogMessageV)0;
static TSPUFlushCacheFunc SPUFlushCache = (TSPUFlushCacheFunc)10880;
static TSPUDMAPrefFunc SPUDMAPref = (TSPUDMAPrefFunc)6992;
static TSPUGetCurPageID SPUGetCurPageID = (TSPUGetCurPageID)7192;
static TSPUGetCurPageAddr SPUGetCurPageAddr = (TSPUGetCurPageAddr)7248;
static TSPUGenFuncPtrFromId SPUGenFuncPtrFromId = (TSPUGenFuncPtrFromId)7296;
static TSPUProfDataHandlerFunc SPUProfDataHandlerFunc = (TSPUProfDataHandlerFunc)0;
static TSPUFuncPtrAssertFunc SPUFuncPtrAssertFunc = (TSPUFuncPtrAssertFunc)7704;
static TSPUTransferFuncProfStatsFunc SPUTransferFuncProfStatsFunc = (TSPUTransferFuncProfStatsFunc)7792;
#define G_SPU_CACHE_PROF_ID_COUNTER_ADDR 0
#define G_SPU_CACHE_PROF_PERF_STAT_ADDR 0
#define G_SPU_CACHE_PREF_DIR_ADDR 18048
#define G_SPU_CACHE_PREF_LRU_DIR_ADDR 18032
#define G_SPU_CACHE_PREF_BUF_ADDR 16000
#define G_SPU_CACHE_ASYNC_RANGES_DIR_FROM 18064
#define G_SPU_CACHE_ASYNC_RANGES_DIR_TO 18080
#define G_SPU_CACHE_CUR_ATOMIC_EA 18016
#define G_SPU_CACHE_SHADOW_CACHE 17968
#define G_SPU_INFO_BLOCK 14976
#define G_SPU_PAGE_DIR_INFO 17888
#define G_SPU_JOB_SPAWN_REG 16704
#define G_SPU_DEBUG_STATE 16672
#define G_SPU_NUM_SETS 17952
#define G_SPU_LOG 0
#define G_SPU_PROGRAM_TOP_LS 16592
#define G_SPU_CPMH 11776
#define G_SPU_HTL 13312
#define G_SPU_LHTL 13824
#define G_SPU_ATOMIC_BUF 15360
#define G_SPU_GCM_CONTEXT_LOCAL_ADDR 16640
#define G_SPU_ZERO16 16720
#define G_SPU_INFO_PACKET_ADDR 16608
#define G_SPU_DABR_ADDR 0
#define G_SPU_JOB_RESOLVE_FUNC_ADDR 16576
#define G_FRAME_STATS_DEST_AREA 16736
#define G_SPU_ID_ADDR 16656
#define G_SPU_CUSTOM_CALLBACK_AREA 16768
#define G_SPU_MEM_MAN 15104
#define G_SPU_DEST_MEM_AREA 16688
#define G_SPU_FUNC_PROF_SPU_AREA 16784
#endif//_DEBUG

//------------------------------------__spu_cache_lookup/__cache_assert-------------------------------
	
//STORE_LINE_NUM is placed at the start of the __cache_* macros:
//	with _NO_SPU_ASSERT it expands to nothing, otherwise it inserts the current __LINE__ into element 3 of g_LineNum
//	the expansion already ends with ';', so the macro is used without a trailing semicolon
#if defined(_NO_SPU_ASSERT)
	#define STORE_LINE_NUM
#else
	#define STORE_LINE_NUM g_LineNum = spu_insert(__LINE__, g_LineNum, 3);
#endif

//cache lookup function, ea is PPU address, keep in sync with declaration in SPUMemManager.cpp
//	hazard mode: both macros forward (ea, LRUIncr, PrefDiff) to SPUDoLookupCacheChecked_func
//	otherwise: the 6 argument entry points receive
//		ea, ea with the cache line offset bits cleared, MASK_SET(ea >> 3), PrefDiff, LRUIncr and the constant 66051 (0x00010203)
//	PrefDiff and LRUIncr are parenthesised before the cast so that an expression argument is converted as a whole
//	(a cast binds tighter than any binary operator, so '(unsigned int)a >> 1' is not '(unsigned int)(a >> 1)')
//	ea is expanded several times, so pass an expression without side effects
#if defined(ENABLE_HAZARD_MODE)
	#define __spu_cache_lookup(ea, LRUIncr, PrefDiff) SPUDoLookupCacheChecked_func((unsigned int)(ea), LRUIncr, PrefDiff)
	#define __spu_cache_lookup_miss(ea, LRUIncr, PrefDiff) SPUDoLookupCacheChecked_func((unsigned int)(ea), LRUIncr, PrefDiff)
#else
	#define __spu_cache_lookup(ea, LRUIncr, PrefDiff)({\
		SPUDoLookupCache_func((unsigned int)(ea), (unsigned int)(ea) & ~scSPUCacheLineSizeMask, MASK_SET(((unsigned int)(ea)>>3)), (unsigned int)(PrefDiff), (unsigned int)(LRUIncr), 66051);})
	#define __spu_cache_lookup_miss(ea, LRUIncr, PrefDiff) SPUDoLookupCacheMiss_func((unsigned int)(ea), (unsigned int)(ea) & ~scSPUCacheLineSizeMask, MASK_SET(((unsigned int)(ea)>>3)), (unsigned int)(PrefDiff), (unsigned int)(LRUIncr), 66051)
#endif

//CheckIsPresent(ea): hazard check that the cache line containing ea is found by the 4 way set lookup
//	the set is taken from GetCacheSetIndexGlob(ea), the line address (ea with the offset bits cleared) is splatted
//	and looked up with SetCache4WayLookup; the check passes when GetCacheIndexNum of the result is >= 0
//	failure handling is whatever spu_CheckCacheHazard (defined further down) is configured to do
//	with _NO_SPU_CACHE_HAZ_CHECK the macro expands to nothing
#if !defined(_NO_SPU_CACHE_HAZ_CHECK)
	#define CheckIsPresent(ea) \
		spu_CheckCacheHazard(0 <= \
		GetCacheIndexNum(SetCache4WayLookup(GetCacheSetIndexGlob((unsigned int)(ea)), \
		spu_splats((unsigned int)(ea) & ~scSPUCacheLineSizeMask))))
#else
		#define CheckIsPresent(ea)
#endif

//__cache_assert(SPU_PTR, PTR, OFFSET): asks SPUDoLookupCacheAssertFunc whether SPU_PTR matches the
//	PPU address PTR + OFFSET/8 (OFFSET is divided by 8 before being added) and executes SPU_DEBUG_HALT if it answers 0
//	the call is skipped when the function pointer is null (it is 0 in some build variants of the address table)
//	SPU_PTR and PTR are parenthesised before the cast so that expression arguments are converted as a whole
//	the expansion is a bare if-block, not a single statement: do not follow it with 'else' and
//	brace it when used as the body of another if
//	with _NO_SPU_CACHE_ASSERT the macro expands to nothing
#if !defined(_NO_SPU_CACHE_ASSERT)
	#define __cache_assert(SPU_PTR, PTR, OFFSET)\
		if(SPUDoLookupCacheAssertFunc) \
		{\
			if(0 == SPUDoLookupCacheAssertFunc((unsigned int)(SPU_PTR), (unsigned int)(PTR) + (OFFSET) / 8, __FILE__, __LINE__))\
				SPU_DEBUG_HALT;\
		}
#else
	#define __cache_assert(SPU_PTR, PTR, OFFSET)
#endif

//STACK_ASSERT(a): reads the stack pointer register ($sp) and calls snPause() when it is less than or equal to
//	the unsigned int stored at address G_SPU_PROGRAM_TOP_LS plus a
//	with _NO_SPU_ASSERT the macro expands to nothing
//	NOTE(review): snPause is presumably declared by libsn_spu.h, which this header only includes under SUPP_SN - confirm
#if defined(_NO_SPU_ASSERT)
	#define STACK_ASSERT(a)
#else
	#define STACK_ASSERT(a)({\
		register unsigned int __sp __asm__("$sp");\
		if(__sp <= *(unsigned int*)(void*)G_SPU_PROGRAM_TOP_LS + (a))snPause();})
#endif

//TEST_STACK replaces whichever definition was chosen above (including the empty _NO_SPU_ASSERT one)
//	with a variant that adds a further 512 to the limit
#ifdef TEST_STACK
#undef STACK_ASSERT
#define STACK_ASSERT(a)({\
register unsigned int __sp __asm__("$sp");\
if(__sp <= *(unsigned int*)(void*)G_SPU_PROGRAM_TOP_LS + 512 + (a))snPause();})
#endif
//optional debug trace for STACK_ASSERT (disabled): printf("%s\n",__func__);

//unless JOB_LIB_COMP is defined, replace the CELL_GCM_ASSERT / CELL_GCM_ASSERTS macros with plain assert()
//	CELL_GCM_ASSERTS drops its message argument
//	the CELL_GCM_RESERVE override inside the block comment below is disabled
#if !defined(JOB_LIB_COMP)
/*	#undef CELL_GCM_RESERVE
	#define CELL_GCM_RESERVE(a) \
		assert((unsigned int)a + (unsigned int)GetGcmSPUData()->contextData.current - ((unsigned int)GetGcmSPUData()->contextData.begin + GetGcmSPUData()->localContextOffset) < LOCAL_SPU_CMD_BUF_SIZE)
*/
	#undef CELL_GCM_ASSERT
	#define CELL_GCM_ASSERT(a) assert(a)

	#undef CELL_GCM_ASSERTS
	#define CELL_GCM_ASSERTS(a,mess) assert(a)
#endif

#endif //_SPU_JOB

//------------------------------------printf----------------------------------------------------------

//the '$' suffixed names map onto the plain functions, same naming scheme as the memcpy/memset mappings further down
#define printf$VL_ printf
#define sprintf$VLL_ sprintf
//__builtin_puts(a) is rerouted through printf, so it follows whatever printf is mapped to below
#define __builtin_puts(a) printf("%s\n",a)
//printf routing:
//	SUPP_PRINTF: printf(...) becomes a call of the handler; inside a job (_SPU_JOB) that is the
//		SPUPrintfHandler_func pointer from the address table, otherwise the extern SPUPrintfHandler function
//		NOTE(review): SPUPrintfHandler_func is 0 in all three address table variants visible in this header - confirm
//		it is patched before jobs built with SUPP_PRINTF call printf
//	otherwise, with SUPP_OLD_PRINTF and no printf macro defined yet: printf becomes spu_printf
//	otherwise printf is left untouched
#if defined(SUPP_PRINTF)
	#if !defined(_SPU_JOB)
		extern void SPUPrintfHandler(const char*, ...);
	#endif
 
	#if defined(_SPU_JOB)
		#define SPU_PRINTF_MISS_HANDLER_FUNC SPUPrintfHandler_func
	#else
		#define SPU_PRINTF_MISS_HANDLER_FUNC SPUPrintfHandler
	#endif

	//GNU named variadic macro; '## args' drops the comma when no arguments follow the format
	#define printf(format, args...) SPU_PRINTF_MISS_HANDLER_FUNC(format, ## args)
#else
	#if defined(SUPP_OLD_PRINTF) && !defined(printf)
		#include <spu_printf.h>
		#define printf spu_printf
	#endif
#endif //SUPP_PRINTF

#if defined(_SPU_JOB)
//returns the unsigned int stored 32 bytes into the block at G_SPU_INFO_BLOCK
__attribute__((always_inline))
inline unsigned int GetCurrentThreadId()
{
	unsigned int* const pThreadIdSlot = (unsigned int*)(void*)(G_SPU_INFO_BLOCK + 32);
	const unsigned int cThreadId = *pThreadIdSlot;
	return cThreadId;
}

#include <SPU/SPUMultiThread.h>

#include <CryModuleDefs.h>
//default the module id to eCryM_System when nothing defined eCryModule before this point
//	(set before platform.h is included)
#ifndef eCryModule
#define eCryModule eCryM_System
#endif
#include <platform.h>

#include "SPUUtilities.h"
#include "Cache/Cache_spu.h"

#include "CodePage/SPUPages.h"
#include "CodePage/SPUPageLayout.h"

#include <SPU/FuncHistTable/SPUFuncPtr.h>

//------------------------------------memcpy/memset-----------------------------------------------------

//name mappings from the '$' suffixed memset/memcpy variants to the implementations declared above
//	NOTE(review): the suffix presumably encodes the memory space per pointer (L = local, M = main) - confirm with the tool emitting these names
#define memset_large_nocache_128$VM_ MemsetLargeNoCache128
#define memset$VM_	Memset__VM
#define memcpy$VMM_	Memcpy__VMM
#define memcpy$VML_	Memcpy__VML
#define memcpy$VLM_	Memcpy__VLM

//definitions for local memset/memcpy variants (no need for reimplementation)
#define memset$VL_ memset
#define memcpy$VLL_ memcpy

//builtin-memcpy maps to memcpy
//	i.e. to the same targets as the memcpy$V??_ names above
#define __builtin_memcpy$VMM_ Memcpy__VMM
#define __builtin_memcpy$VML_ Memcpy__VML
#define __builtin_memcpy$VLM_ Memcpy__VLM
#define __builtin_memcpy$VLL_ memcpy

//transfers main memory without any cache awareness or syncing, same dma tag as below
//	asserts that pDestMain and pLS have the same low 4 address bits, then calls
//	MemcpyMain(dest, src, cSize, USER_DMA_TAG_BASE+1); completion is not awaited here
#define __spu_dma_to_main_no_cache_no_sync(pDestMain, pLS, cSize)({\
	assert(((unsigned int)(pDestMain) & 15) == ((unsigned int)(pLS) & 15));\
	MemcpyMain((unsigned int)(pDestMain), (void*)(pLS), cSize, USER_DMA_TAG_BASE+1);})

#define __spu_dma_to_main_no_cache_no_sync$VML_ __spu_dma_to_main_no_cache_no_sync

//transfers from main memory without any cache awareness or syncing
//	same alignment assert as above, then MemcpyLS(dest, src, cSize, USER_DMA_TAG_BASE+1)
#define __spu_dma_to_ls_no_cache_no_sync(pLS, pMain, cSize)({\
	assert(((unsigned int)(pMain) & 15) == ((unsigned int)(pLS) & 15));\
	MemcpyLS((void*)(pLS), (unsigned int)(pMain), cSize, USER_DMA_TAG_BASE+1);})

#define __spu_dma_to_ls_no_cache_no_sync$VLM_ __spu_dma_to_ls_no_cache_no_sync

//calls SyncMemory on the tag (USER_DMA_TAG_BASE+1) used by the transfer macros in this group
#define __spu_sync_dma_no_cache_no_sync()	SyncMemory(USER_DMA_TAG_BASE+1)

//transfers 16 byte to main memory without any cache awareness or syncing
//	the source is the 16 bytes at G_SPU_ZERO16; pDestMain must be 16 byte aligned (asserted)
#define __spu_zero_mem16_no_cache_no_sync(pDestMain)({\
	assert(((unsigned int)(pDestMain) & 15) == 0);\
	MemcpyMain((unsigned int)(pDestMain), (void*)G_SPU_ZERO16, 16, USER_DMA_TAG_BASE+1);})

#define __spu_zero_mem16_no_cache_no_sync$VM_ __spu_zero_mem16_no_cache_no_sync

//transfers 1,2,4,8 byte to main memory without any cache awareness or syncing
//	the source is taken from the G_SPU_ZERO16 area at the same offset within 16 bytes as pDestMain
//	cFenced selects MemcpyMainFenced over MemcpyMain; the tag is MEM_TRANSFER_DMA_TAG_BASE + cSyncPointID
//	cSize must be below 16 (asserted); completion is not awaited here
//	pDestMain and cSize are parenthesised wherever an operator is applied to them, so expression arguments
//	(e.g. 'base | offset' or 'p + n') are evaluated as a whole before the cast / mask / comparison
//	pDestMain is expanded more than once, so pass an expression without side effects
#define __spu_zero_mem_no_cache_no_sync(pDestMain, cSize, cFenced, cSyncPointID)({\
	assert((cSize)<16);\
	if(cFenced)MemcpyMainFenced((unsigned int)(pDestMain),(void*)(G_SPU_ZERO16+((unsigned int)(pDestMain)&15)), cSize, MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID));\
	else MemcpyMain((unsigned int)(pDestMain),(void*)(G_SPU_ZERO16+((unsigned int)(pDestMain)&15)), cSize, MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID));})

#define __spu_zero_mem_no_cache_no_sync$VM_ __spu_zero_mem_no_cache_no_sync

//------------------------------------cache flushing-----------------------------------------------------

//zeroes the two 16 byte prefetch directory vectors at G_SPU_CACHE_PREF_DIR_ADDR and G_SPU_CACHE_PREF_LRU_DIR_ADDR
#define __spu_invalidate_cache_prefetches()({\
	*(vec_uint4* __restrict)G_SPU_CACHE_PREF_DIR_ADDR			= spu_splats((unsigned int)0);\
	*(vec_uint4* __restrict)G_SPU_CACHE_PREF_LRU_DIR_ADDR = spu_splats((unsigned int)0);})

//the four macros below call FlushCacheRange(address, size, flag) and then invalidate the prefetch directories
//	the 'flush' variants pass true as the last argument, the 'invalidate' variants pass false
//	the 'line' variants use a fixed size of 128
//	all macro arguments are parenthesised before the cast so that expression arguments are converted as a whole
#define __spu_flush_cache_range(cEAFrom, cSize)({\
	FlushCacheRange((unsigned int)(cEAFrom), (unsigned int)(cSize), true);\
	__spu_invalidate_cache_prefetches();})

#define __spu_flush_cache_line(cEA)({\
	FlushCacheRange((unsigned int)(cEA), 128, true);\
	__spu_invalidate_cache_prefetches();})

#define __spu_invalidate_cache_line(cEA)({\
	FlushCacheRange((unsigned int)(cEA), 128, false);\
	__spu_invalidate_cache_prefetches();})

#define __spu_invalidate_cache_range(cEAFrom, cSize)({\
	FlushCacheRange((unsigned int)(cEAFrom), (unsigned int)(cSize), false);\
	__spu_invalidate_cache_prefetches();})

__attribute__((always_inline))
inline void __spu_flush_cache()
{
#if defined(DO_SPU_PROFILING)
	SPUFlushCache(true, false);
#else
	SPUFlushCache(true);
#endif
	//clear LRU and cache dir
	const vec_uint4 cZero = spu_splats((unsigned int)0);
	const int cNumSets = *(int*)G_SPU_NUM_SETS;
	//reset cache dir entries, 4 at once to give the branch hint a chance to be set
	for(unsigned int s=0; s<cNumSets; s += 4)
	{
		g_pSPUCacheLRUCtrl[s]		= cZero;
		g_pSPUCacheDir[s]				= cZero;
		g_pSPUCacheLRUCtrl[s+1] = cZero;
		g_pSPUCacheDir[s+1]			= cZero;
		g_pSPUCacheLRUCtrl[s+2] = cZero;
		g_pSPUCacheDir[s+2]			= cZero;
		g_pSPUCacheLRUCtrl[s+3] = cZero;
		g_pSPUCacheDir[s+3]			= cZero;
	}
}

//spu_CheckCacheHazard(cond): when checks are enabled (ENABLE_HAZARD_MODE defined, or _NO_SPU_CACHE_HAZ_CHECK not defined)
//	a false cond prints SPU_ASSERT_STRING, the stringified condition, __FILE__ and __LINE__ and then executes SPU_DEBUG_HALT
//	the condition is marked as expected to hold (__builtin_expect)
//	otherwise the macro expands to an empty statement and cond is not evaluated
//	both forms are do{}while(false) wrapped, so a trailing ';' is required at the use site
//	the message goes through printf, i.e. through whatever the printf routing above selected
#if defined(ENABLE_HAZARD_MODE) || !defined(_NO_SPU_CACHE_HAZ_CHECK)
	#define spu_CheckCacheHazard(cond) \
	do \
	{ \
		if (__builtin_expect(!(cond), 0)) \
		{ \
			printf("%s cache hazard: (%s) in %s line %d\n",SPU_ASSERT_STRING, #cond, __FILE__, __LINE__); \
			SPU_DEBUG_HALT; \
		} \
	}\
	while (false)
#else
	#define spu_CheckCacheHazard(cond) do{}while (false)
#endif //ENABLE_HAZARD_MODE || !_NO_SPU_CACHE_HAZ_CHECK

//calls the SPUDMAPref entry point with cEA converted to unsigned int
//	cEA is parenthesised so that an expression argument is evaluated as a whole before the conversion
#define __spu_dma_pref(cEA)	SPUDMAPref((unsigned int)(cEA))
__attribute__((always_inline))
inline void __spu_dma_pref_inl(const unsigned int cEA)
{
	spu_CheckCacheHazard(cEA > (unsigned int)256 * 1024);
	const unsigned int cAllocatedEA							= cEA & ~scSPUCacheLineSizeMask;
	const vec_uint4 cAllocatedEA4								= spu_splats(cAllocatedEA);
	vec_uint4* const __restrict pPrefLRUDir			= (vec_uint4* __restrict)G_SPU_CACHE_PREF_LRU_DIR_ADDR;
	vec_uint4* const __restrict pPrefDir				= (vec_uint4* __restrict)G_SPU_CACHE_PREF_DIR_ADDR;
	const vec_uint4 cCurPrefLRUDirCont					= *pPrefLRUDir;
	IF(spu_extract(spu_gather(spu_and(*pPrefDir, cAllocatedEA4)),0) != 0, 0)//already present
		return;
	const unsigned int cLRUReplIndex						= NSPU::GetReplIndex(cCurPrefLRUDirCont);
	const vec_uint4 cLRUReplMask								= spu_insert(0xFFFFFFFF, (const vec_uint4){0}, cLRUReplIndex);
	si_wrch(MFC_LSA,si_from_uint(G_SPU_CACHE_PREF_BUF_ADDR + (cLRUReplIndex << scSPUCacheLineSizeShift)));
	//		si_wrch(MFC_EAH,si_from_uint((uint64)(cEA)>>32));//this should not be necessary at all, but it becomes very slow otherwise
	*pPrefLRUDir																= spu_sel(cCurPrefLRUDirCont, spu_add(g_LRUCounter, 8), cLRUReplMask);
	si_wrch(MFC_EAL,si_from_uint(cAllocatedEA));
	*pPrefDir																		= spu_sel(*pPrefDir, cAllocatedEA4, cLRUReplMask);
	si_wrch(MFC_Size,si_from_uint(scSPUCacheLineSize));
	si_wrch(MFC_TagID,si_from_uint(GetPrefetchTagID(cLRUReplIndex)));
	si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD));//toggle prefetch
}

//------------------------------------cache touch-----------------------------------------------------

//touch function to update the LRU counter for an existing cache line (cpSPUPtr = cached SPU ptr)
//determine the set and index within by applying some arithmetic on the SPU address
__attribute__((always_inline))
inline void __spu_cache_touch(const uint8* const __restrict cpSPUPtr, const unsigned int cLRUIncr)
{
	unsigned int *const __restrict pLRUSlot = 
		(unsigned int*)((((unsigned int)cpSPUPtr & ~scSPUCacheLineSizeMask) >> (scSPUCacheLineSizeShift-2)) + 
		((unsigned int)g_pSPUCacheLRUCtrl - ((unsigned int)g_pSPUCache >> (scSPUCacheLineSizeShift-2))));
	const vec_uint4 cNewLRUCntr			  = spu_add(g_LRUCounter, cLRUIncr);
	g_LRUCounter = spu_add(g_LRUCounter, 1);
	*pLRUSlot		 = spu_extract(cNewLRUCntr, 0);
};

__attribute__((always_inline))
inline void __spu_cache_touch_ex(const unsigned int cPrepConst, const unsigned int cLRUIncr)
{
	unsigned int *const __restrict pLRUSlot = (unsigned int*)cPrepConst;
	const vec_uint4 cNewLRUCntr			  = spu_add(g_LRUCounter, cLRUIncr);
	g_LRUCounter = spu_add(g_LRUCounter, 1);
	*pLRUSlot		 = spu_extract(cNewLRUCntr, 0);
};

//computes the LRU entry address consumed by __spu_cache_touch_ex:
//	line base of cpSPUPtr shifted right by (line size shift - 2), plus cTouchConst (see __spu_cache_init_touch)
__attribute__((always_inline))
inline const unsigned int __spu_cache_prep_touch(const uint8* const __restrict cpSPUPtr, const unsigned int cTouchConst)
{
	const unsigned int cLineBase = (unsigned int)cpSPUPtr & ~scSPUCacheLineSizeMask;
	const unsigned int cScaled	 = cLineBase >> (scSPUCacheLineSizeShift-2);
	return (cScaled + cTouchConst);
}

//computes the constant part of the LRU entry address, to be passed to __spu_cache_prep_touch as cTouchConst:
//	address of g_pSPUCacheLRUCtrl minus (address of g_pSPUCache shifted right by (line size shift - 2))
__attribute__((always_inline))
inline const unsigned int __spu_cache_init_touch()
{
	const unsigned int cScaledCacheBase = (unsigned int)g_pSPUCache >> (scSPUCacheLineSizeShift-2);
	const unsigned int cCtrlBase				= (unsigned int)g_pSPUCacheLRUCtrl;
	return (cCtrlBase - cScaledCacheBase);
}

//------------------------------------cache lookup funcs-----------------------------------------------------

//adds cIncr (converted to unsigned int) to g_LRUCounter
//	cIncr is parenthesised so that an expression argument is evaluated as a whole before the conversion
#define __spu_cache_incr_lru_cntrl(cIncr)({\
	g_LRUCounter = spu_add(g_LRUCounter, (unsigned int)(cIncr));})

//cache line selection function which selects a cache line from 2 already cached, 
//	in PPU memory consecutive cache lines
__attribute__((always_inline))
inline void* __spu_cache_select
(
	const uint8* const __restrict cpSPUPtrExist0,	//SPU address of first cache line
	const uint8* const __restrict cpSPUPtrExist1,	//SPU address of second cache line
	const unsigned int cpPPUPtrOff											//PPU offset relative to the corresponding PPU address of the first cache line
)
{
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask		= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr, 0), spu_promote((unsigned int)127, 0));
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0, 0), spu_promote(cAlignedSPUExist1, 0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//__cache_* statement macros used by job code, in a profiling and a non profiling flavour
//	common conventions:
//		- RESULT is an lvalue that receives the SPU pointer
//		- OFFSET is divided by 8 before it is added to the pointer it belongs to
//		- every macro starts with STORE_LINE_NUM and runs the spu_CheckCacheHazard / CheckIsPresent checks
//			that are enabled in this build
//		- macro arguments are parenthesised wherever an operator or cast is applied to them, so expression
//			arguments (e.g. 'a + b' as OFFSET, 'p + 1' as pointer) are evaluated as a whole
//		- arguments may be expanded more than once, so pass expressions without side effects
//	the profiling flavour stores PROFID in g_ProfID before the lookup and does not check RESULT afterwards;
//	the non profiling flavour ignores PROFID (except for passing it on in __cache_lookup_ex) and checks that
//	RESULT is below 256 * 1024
//	NOTE(review): __cache_lookup_miss only exists in the non profiling flavour - confirm that is intended
#if defined(DO_SPU_PROFILING)
	//records cProfID in element 0 of g_ProfID, then performs the regular lookup
	__attribute__((always_inline))
	inline void* __spu_cache_lookup_prof(const unsigned int cEA, const unsigned int cLRUIncr, const unsigned int cProfID, const int cPrefDiff)
	{
//		assert(cProfID < MAX_PROF_ID);
		g_ProfID = spu_promote(cProfID,0);
		return __spu_cache_lookup(cEA, cLRUIncr, cPrefDiff);
	}

	//records cProfID in element 0 of g_ProfID, then performs the lookup through DoVolatileCacheLookup
	__attribute__((always_inline))
	inline void* __spu_cache_lookup_prof_volatile(const unsigned int cEA, const unsigned int cLRUIncr, const unsigned int cProfID, const int cPrefDiff)
	{
//		assert(cProfID < MAX_PROF_ID);
		g_ProfID = spu_promote(cProfID,0);
		return DoVolatileCacheLookup(cEA, cLRUIncr, cPrefDiff);
	}

	#define __cache_lookup(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		spu_CheckCacheHazard((PROFID) < MAX_PROF_ID);\
		RESULT = __spu_cache_lookup_prof((unsigned int)_PTR, LRU_INCR, PROFID, PREF_DIFF); })

	#define __cache_lookup_ex(RESULT, SPU_PTR_EXIST, PPU_PTR_EXIST, OFFSET_EXIST, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((unsigned int)(SPU_PTR_EXIST) < (unsigned int)(256 * 1024));\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST));\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		spu_CheckCacheHazard((PROFID) < MAX_PROF_ID);\
		const uint8_t *const _PPU_PTR_EXIST = \
		(const uint8_t *)(PPU_PTR_EXIST) + (OFFSET_EXIST) / 8; \
		RESULT = __spu_cache_lookup_ex((const uint8_t *const)(SPU_PTR_EXIST), _PPU_PTR_EXIST, (const uint8* const __restrict)_PTR, LRU_INCR, PROFID, PREF_DIFF); })

	#define __cache_select(RESULT, SPU_PTR_EXIST0, SPU_PTR_EXIST1, OFFSET, PROFID) ({ \
		STORE_LINE_NUM \
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST0));\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST1));\
		RESULT = __spu_cache_select((const uint8_t *const)(SPU_PTR_EXIST0), (const uint8_t *const)(SPU_PTR_EXIST1), (OFFSET)/8); })

	#define __cache_lookup_volatile(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		spu_CheckCacheHazard((PROFID) < MAX_PROF_ID);\
		RESULT = __spu_cache_lookup_prof_volatile((unsigned int)_PTR, LRU_INCR, PROFID, PREF_DIFF); })

#else

	#define __cache_lookup(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		RESULT = __spu_cache_lookup(_PTR, LRU_INCR, PREF_DIFF);\
		spu_CheckCacheHazard((unsigned int)(RESULT) < (unsigned int)(256 * 1024)); })

	#define __cache_lookup_miss(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		RESULT = __spu_cache_lookup_miss(_PTR, LRU_INCR, PREF_DIFF);\
		spu_CheckCacheHazard((unsigned int)(RESULT) < (unsigned int)(256 * 1024)); })

	#define __cache_lookup_ex(RESULT, SPU_PTR_EXIST, PPU_PTR_EXIST, OFFSET_EXIST, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((unsigned int)(SPU_PTR_EXIST) < (unsigned int)(256 * 1024));\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST));\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		const uint8_t *const _PPU_PTR_EXIST = \
		(const uint8_t *)(PPU_PTR_EXIST) + (OFFSET_EXIST) / 8; \
		RESULT = __spu_cache_lookup_ex((const uint8_t *const)(SPU_PTR_EXIST), _PPU_PTR_EXIST, (const uint8* const __restrict)_PTR, LRU_INCR, PROFID, PREF_DIFF);\
		spu_CheckCacheHazard((unsigned int)(RESULT) < (unsigned int)(256 * 1024)); })

	#define __cache_select(RESULT, SPU_PTR_EXIST0, SPU_PTR_EXIST1, OFFSET, PROFID) ({ \
		STORE_LINE_NUM \
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST0));\
		CheckIsPresent((ptrdiff_t)(SPU_PTR_EXIST1));\
		RESULT = __spu_cache_select((const uint8_t *const)(SPU_PTR_EXIST0), (const uint8_t *const)(SPU_PTR_EXIST1), (OFFSET)/8); })

	#define __cache_lookup_volatile(RESULT, PTR, OFFSET, LRU_INCR, PREF_DIFF, PROFID) ({ \
		STORE_LINE_NUM \
		const uint8_t *const _PTR = \
		(const uint8_t *)(PTR) + (OFFSET) / 8; \
		spu_CheckCacheHazard((ptrdiff_t)_PTR != 0);\
		spu_CheckCacheHazard((unsigned int)_PTR > (unsigned int)(256 * 1024));\
		RESULT = DoVolatileCacheLookup((unsigned int)_PTR, (unsigned int)(LRU_INCR), (unsigned int)(PREF_DIFF));\
		spu_CheckCacheHazard((unsigned int)(RESULT) < (unsigned int)(256 * 1024)); })
#endif //DO_SPU_PROFILING

//__cache_reload_line / __cache_write_line: call ReloadCacheLine / WriteBackCacheLine with the SPU pointer and
//	the PPU address cpPPUPtr + cPPUOff/8 (cPPUOff is divided by 8 before it is added)
//	the profiling flavour additionally asserts PROFID < MAX_PROF_ID and stores it in g_ProfID
//	both flavours are single statement expressions, so the whole expansion (including STORE_LINE_NUM) stays
//	together when the macro is the unbraced body of an if/else/loop
//	macro arguments are parenthesised before casts and the division, so expression arguments are evaluated as a whole
#if defined(DO_SPU_PROFILING)
	#define __cache_reload_line(cpSPUPtr, cpPPUPtr, cPPUOff, PROFID) ({ \
		STORE_LINE_NUM \
		assert((PROFID) < MAX_PROF_ID);g_ProfID = spu_promote((unsigned int)(PROFID),0);\
		ReloadCacheLine((unsigned int)(void*)(cpSPUPtr), (unsigned int)(void*)(cpPPUPtr) + (cPPUOff)/8); })
	#define __cache_write_line(cpSPUPtr, cpPPUPtr, cPPUOff, PROFID) ({ \
		STORE_LINE_NUM \
		assert((PROFID) < MAX_PROF_ID);g_ProfID = spu_promote((unsigned int)(PROFID),0);\
		WriteBackCacheLine((unsigned int)(void*)(cpSPUPtr), (unsigned int)(void*)(cpPPUPtr) + (cPPUOff)/8); })
#else
	#define __cache_reload_line(cpSPUPtr, cpPPUPtr, cPPUOff, PROFID) ({ \
		STORE_LINE_NUM \
		ReloadCacheLine((unsigned int)(void*)(cpSPUPtr), (unsigned int)(void*)(cpPPUPtr) + (cPPUOff)/8); })
	#define __cache_write_line(cpSPUPtr, cpPPUPtr, cPPUOff, PROFID) ({ \
		STORE_LINE_NUM \
		WriteBackCacheLine((unsigned int)(void*)(cpSPUPtr), (unsigned int)(void*)(cpPPUPtr) + (cPPUOff)/8); })
#endif

//lookup with a shortcut: cpSPUPtrExist is an SPU pointer that is already cached and cpPPUPtrExist the PPU pointer
//  it corresponds to; when cpPtr (PPU pointer) falls into the same cache line, the SPU address is derived directly
//  from cpSPUPtrExist, otherwise a regular lookup (profiling flavour when DO_SPU_PROFILING is set) is performed
//keep in sync with definition in SPUMemManager_spu.h
__attribute__((always_inline))
inline void* __spu_cache_lookup_ex
(
	const uint8* const __restrict cpSPUPtrExist, 
	const uint8* const __restrict cpPPUPtrExist,
	const uint8* const __restrict cpPtr,
	const unsigned int cLRUIncr,
	const unsigned int cProfID,
	const int cPrefDiff
)
{
	//pointer validity checks are left to the calling macros
	//	spu_CheckCacheHazard((unsigned int)cpSPUPtrExist >= (unsigned int)g_pSPUCache);
	//	CheckIsPresent(cpSPUPtrExist);
	const unsigned int cKnownLineEA = (unsigned int)cpPPUPtrExist & ~scSPUCacheLineSizeMask;
	const unsigned int cQueryLineEA = (unsigned int)cpPtr & ~scSPUCacheLineSizeMask;
	IF(cQueryLineEA == cKnownLineEA, true)//same line: SPU line base plus the in-line offset of the query
		return (void*)(((unsigned int)cpSPUPtrExist & ~scSPUCacheLineSizeMask) + ((unsigned int)cpPtr & scSPUCacheLineSizeMask));
#if defined(DO_SPU_PROFILING)
	return __spu_cache_lookup_prof((unsigned int)cpPtr, cLRUIncr, cProfID, cPrefDiff);
#else
	return __spu_cache_lookup((unsigned int)cpPtr, cLRUIncr, cPrefDiff);
#endif
}

//looks up the cache line for cpPPUPTR (result stored in *cppSPUPtrExist0) and returns the SPU address
//  selected between line 0 and line 1:
//  - offset = cpPPUPtrOff + (offset of cpPPUPTR inside its cache line)
//  - offset > 127 selects the line base of cpSPUPtrExist1, otherwise the base of the line just looked up
//  - result = selected base + (offset & scSPUCacheLineSizeMask)
//cProfID is only used if DO_SPU_PROFILING is defined
//NOTE(review): the literal 127 presumably equals scSPUCacheLineSizeMask - confirm
__attribute__((always_inline))
inline void* __spu_cache_lookup0_select
(
	const uint8* const __restrict cpPPUPTR,				//ppu pointer of first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist1,	//SPU address of second cache line
	const unsigned int cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const unsigned int cLRUIncr,
	const unsigned int cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist1);
	spu_CheckCacheHazard((unsigned int)cpPPUPTR > (unsigned int)(256 * 1024));
	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpPPUPTR & scSPUCacheLineSizeMask);
	//select mask is computed before the lookup call, all bits set in slot 0 if the offset exceeds 127
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr, 0), spu_promote((unsigned int)127, 0));
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR, cLRUIncr, cProfID, cPrefDiff);
#else
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR, cLRUIncr, cPrefDiff);
#endif
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)(*cppSPUPtrExist0) & ~scSPUCacheLineSizeMask;
	//spu_sel takes the second operand where the mask bits are set
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//performs a __spu_cache_lookup_ex for the first cache line in relation to the second one
//  - if cpPPUPTR and cpPPUPTR+cDiff share a cache line, *cppSPUPtrExist0 is built from the line base of
//    cpSPUPtrExist1 plus the in-line offset of cpPPUPTR, otherwise a cache lookup for cpPPUPTR is issued
//  - the returned SPU address is selected as in __spu_cache_lookup0_select: offset > 127 picks the base of
//    line 1, otherwise the base of line 0, plus (offset & scSPUCacheLineSizeMask)
//cProfID is only used if DO_SPU_PROFILING is defined
__attribute__((always_inline))
inline void* __spu_cache_lookup0_ex_select
(
	const uint8* const __restrict cpPPUPTR,				//ppu pointer of first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist1,	//SPU address of second cache line
	const unsigned int cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const unsigned int cDiff,														//pointer diff between both cache lines
	const unsigned int cLRUIncr,
	const unsigned int cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist1);
	spu_CheckCacheHazard((unsigned int)cpPPUPTR > (unsigned int)(256 * 1024));

	//call to __spu_cache_lookup_ex is inlined and interleaved due to compiler error (prevention)
/*
	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpPPUPTR & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_ex(cpSPUPtrExist1, cpPPUPTR+cDiff, cpPPUPTR, cLRUIncr, cProfID, cPrefDiff);
*/
	//cAlignedPPUEA: line of the second cache line's PPU address, cAlignedPPtrPPUEA: line of the queried address
	const unsigned int cAlignedPPUEA									= (unsigned int)(cpPPUPTR+cDiff) & ~scSPUCacheLineSizeMask;
	const unsigned int cAlignedPPtrPPUEA							= (unsigned int)cpPPUPTR & ~scSPUCacheLineSizeMask;
	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpPPUPTR & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	IF(cAlignedPPtrPPUEA == cAlignedPPUEA, true)//if in the same cache line, reuse
	{
		const unsigned int cAlignedSPUEA	= (unsigned int)cpSPUPtrExist1 & ~scSPUCacheLineSizeMask;
		const unsigned int cOffsetPPUPtr	= (unsigned int)cpPPUPTR & scSPUCacheLineSizeMask;
		*cppSPUPtrExist0						=  (uint8*)(cAlignedSPUEA + cOffsetPPUPtr);
	}
	else
#if defined(DO_SPU_PROFILING)
		*cppSPUPtrExist0						=  (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR, cLRUIncr, cProfID, cPrefDiff);
#else
		*cppSPUPtrExist0						=  (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR, cLRUIncr, cPrefDiff);
#endif

	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)(*cppSPUPtrExist0) & ~scSPUCacheLineSizeMask;
	//spu_sel takes the second operand where the mask bits are set
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//looks up the cache line for cpPPUPTR1 (result stored in *cppSPUPtrExist1) and returns the SPU address
//  selected between line 0 and line 1:
//  - offset = cpPPUPtrOff + (offset of cpSPUPtrExist0 inside its cache line)
//  - offset > 127 selects the base of the line just looked up, otherwise the line base of cpSPUPtrExist0
//  - result = selected base + (offset & scSPUCacheLineSizeMask)
//cProfID is only used if DO_SPU_PROFILING is defined
__attribute__((always_inline))
inline void* __spu_cache_lookup1_select
(
	const uint8* const __restrict cpPPUPTR1,			//ppu pointer of second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist0,	//SPU address of first cache line
	const unsigned int cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const unsigned int cLRUIncr,
	const unsigned int cProfID,													//profiling ID
	const int cPrefDiff
)
{
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist0);
	spu_CheckCacheHazard((unsigned int)cpPPUPTR1 > (unsigned int)(256 * 1024));
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	//the in-line offset is taken from the SPU address of the first line here
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR1, cLRUIncr, cProfID, cPrefDiff);
#else
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR1, cLRUIncr, cPrefDiff);
#endif
	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)(*cppSPUPtrExist1) & ~scSPUCacheLineSizeMask;
	//spu_sel takes the second operand where the mask bits are set
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//performs a __spu_cache_lookup_ex for the second cache line in relation to the first one
//  - if cpPPUPTR1 and cpPPUPTR0 share a cache line, *cppSPUPtrExist1 is built from the line base of
//    cpSPUPtrExist0 plus the in-line offset of cpPPUPTR1, otherwise a cache lookup for cpPPUPTR1 is issued
//  - the returned SPU address is selected as in __spu_cache_lookup1_select: offset > 127 picks the base of
//    line 1, otherwise the base of line 0, plus (offset & scSPUCacheLineSizeMask)
//cProfID is only used if DO_SPU_PROFILING is defined
__attribute__((always_inline))
inline void* __spu_cache_lookup1_ex_select
(
	const uint8* const __restrict cpPPUPTR1,			//ppu pointer of second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const uint8* const __restrict cpSPUPtrExist0,	//SPU address of first cache line
	const unsigned int cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const uint8* const __restrict cpPPUPTR0,			//ppu pointer of first cache line
	const unsigned int cLRUIncr,
	const unsigned int cProfID,													//profiling ID
	const int cPrefDiff
)
{
	//__spu_cache_lookup_ex is inlined to avoid a compiler bug
	CheckIsPresent((ptrdiff_t)cpSPUPtrExist0);
	spu_CheckCacheHazard((unsigned int)cpPPUPTR1 > (unsigned int)(256 * 1024));
/*	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_ex(cpSPUPtrExist0, cpPPUPTR0, cpPPUPTR1, cLRUIncr, cProfID, cPrefDiff);
*/
	//cAlignedPPUEA: line of the first cache line's PPU address, cAlignedPPtrPPUEA: line of the queried address
	const unsigned int cAlignedPPUEA		 = (unsigned int)cpPPUPTR0 & ~scSPUCacheLineSizeMask;
	const unsigned int cAlignedPPtrPPUEA = (unsigned int)cpPPUPTR1 & ~scSPUCacheLineSizeMask;
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpSPUPtrExist0 & scSPUCacheLineSizeMask);
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	IF(cAlignedPPtrPPUEA == cAlignedPPUEA, true)//if in the same cache line, reuse
	{
		const unsigned int cAlignedSPUEA		 = (unsigned int)cpSPUPtrExist0 & ~scSPUCacheLineSizeMask;
		const unsigned int cOffsetPPUPtr		 = (unsigned int)cpPPUPTR1 & scSPUCacheLineSizeMask;
		*cppSPUPtrExist1 = (uint8*)(cAlignedSPUEA + cOffsetPPUPtr);
	}
	else
	#if defined(DO_SPU_PROFILING)
		*cppSPUPtrExist1 = (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR1, cLRUIncr, cProfID, cPrefDiff);
	#else
		*cppSPUPtrExist1 = (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR1, cLRUIncr, cPrefDiff);
	#endif

	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)(*cppSPUPtrExist1) & ~scSPUCacheLineSizeMask;
	//spu_sel takes the second operand where the mask bits are set
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//issues a cache lookup for both cache lines (cpPPUPTR0 -> *cppSPUPtrExist0, cpPPUPTR1 -> *cppSPUPtrExist1)
//  and returns the SPU address selected between them:
//  - offset = cpPPUPtrOff + (offset of cpPPUPTR0 inside its cache line)
//  - offset > 127 selects the base of line 1, otherwise the base of line 0
//  - result = selected base + (offset & scSPUCacheLineSizeMask)
//cProfID is only used if DO_SPU_PROFILING is defined
__attribute__((always_inline))
inline void* __spu_cache_lookup01_select
(
	const uint8* const __restrict cpPPUPTR0,			//PPU address for first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpPPUPTR1,			//PPU address for second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const unsigned int cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const unsigned int cLRUIncr0,
	const unsigned int cLRUIncr1,
	const unsigned int cProfID,													//profiling ID
	const int cPrefDiff0,
	const int cPrefDiff1
)
{
	spu_CheckCacheHazard((unsigned int)cpPPUPTR0 > (unsigned int)(256 * 1024));
	spu_CheckCacheHazard((unsigned int)cpPPUPTR1 > (unsigned int)(256 * 1024));
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpPPUPTR0 & scSPUCacheLineSizeMask);
	//first line
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR0, cLRUIncr0, cProfID, cPrefDiff0);
#else
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR0, cLRUIncr0, cPrefDiff0);
#endif
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)*cppSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	//second line
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR1, cLRUIncr1, cProfID, cPrefDiff1);
#else
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR1, cLRUIncr1, cPrefDiff1);
#endif
	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)*cppSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	//spu_sel takes the second operand where the mask bits are set
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//performs a __spu_cache_lookup for the first cache line and a 
//	__spu_cache_lookup_ex for the second cache line in relation to the first one
__attribute__((always_inline))
inline void* __spu_cache_lookup01_ex_select
(
	const uint8* const __restrict cpPPUPTR0,			//PPU address for first cache line
	const uint8** __restrict cppSPUPtrExist0,			//pointer to SPU address of first cache line (to obtain)
	const uint8* const __restrict cpPPUPTR1,			//PPU address for second cache line
	const uint8** __restrict cppSPUPtrExist1,			//pointer to SPU address of second cache line (to obtain)
	const unsigned int cpPPUPtrOff,											//PPU offset relative to the corresponding PPU address of the first cache line
	const unsigned int cLRUIncr0,
	const unsigned int cLRUIncr1,
	const unsigned int cProfID,													//profiling ID
	const int cPrefDiff0,
	const int cPrefDiff1
)
{
	spu_CheckCacheHazard((unsigned int)cpPPUPTR0 > (unsigned int)(256 * 1024));
	spu_CheckCacheHazard((unsigned int)cpPPUPTR1 > (unsigned int)(256 * 1024));
	const unsigned int cOffsetQueryPPUPtr _ALIGN(16)	= cpPPUPtrOff + ((unsigned int)cpPPUPTR0 & scSPUCacheLineSizeMask);
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR0, cLRUIncr0, cProfID, cPrefDiff0);
#else
	*cppSPUPtrExist0														= (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR0, cLRUIncr0, cPrefDiff0);
#endif
	//call to __spu_cache_lookup_ex is inlined and interleaved due to compiler error (prevention)
/*
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)*cppSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_ex(*cppSPUPtrExist0, cpPPUPTR0, cpPPUPTR1, cLRUIncr1, cProfID, cPrefDiff1);
*/
	const unsigned int cAlignedPPUEA									= (unsigned int)cpPPUPTR0 & ~scSPUCacheLineSizeMask;
	const unsigned int cAlignedPPtrPPUEA							= (unsigned int)cpPPUPTR1 & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelMask										= spu_cmpgt(spu_promote(cOffsetQueryPPUPtr,0), spu_promote((unsigned int)127,0));
	const unsigned int cAlignedSPUExist0 _ALIGN(16)		= (unsigned int)*cppSPUPtrExist0 & ~scSPUCacheLineSizeMask;
	IF(cAlignedPPtrPPUEA == cAlignedPPUEA, true)//if in the same cache line, reuse
	{
		const unsigned int cAlignedSPUEA								= (unsigned int)*cppSPUPtrExist0 & ~scSPUCacheLineSizeMask;
		const unsigned int cOffsetPPUPtr								= (unsigned int)cpPPUPTR1 & scSPUCacheLineSizeMask;
		*cppSPUPtrExist1													= (uint8*)(cAlignedSPUEA + cOffsetPPUPtr);
	}
#if defined(DO_SPU_PROFILING)
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup_prof((unsigned int)cpPPUPTR1, cLRUIncr1, cProfID, cPrefDiff1);
#else
	*cppSPUPtrExist1														= (uint8*)__spu_cache_lookup((unsigned int)cpPPUPTR1, cLRUIncr1, cPrefDiff1);
#endif

	const unsigned int cAlignedSPUExist1 _ALIGN(16)		= (unsigned int)*cppSPUPtrExist1 & ~scSPUCacheLineSizeMask;
	const vec_uint4 cSelSPUAddr = spu_sel(spu_promote(cAlignedSPUExist0,0), spu_promote(cAlignedSPUExist1,0), cSelMask);
	return (void*)((cOffsetQueryPPUPtr & scSPUCacheLineSizeMask) + spu_extract(cSelSPUAddr, 0));
}

//forwards to __cache_select, DIFF is not used
#define __cache_interpolate(RESULT, LOCAL0, LOCAL1, OFFSET, DIFF, MEMREF_ID) ({ \
	__cache_select(RESULT, LOCAL0, LOCAL1, OFFSET, MEMREF_ID); })

//RESULT = __spu_cache_lookup0_select for the address MAIN + (MAIN_OFFSET - OFFSET)/8
//LOCAL0 is written through its address (obtained line 0), LOCAL1 is the existing line 1, DIFF is not used
//NOTE(review): all offsets are divided by 8, so they are presumably bit offsets - confirm
#define __cache_lookup0_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	RESULT = __spu_cache_lookup0_select(_PTR, (const uint8** __restrict)&(LOCAL0), (const uint8* const __restrict)LOCAL1, OFFSET/8, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//same as __cache_lookup0_interpolate but calls __spu_cache_lookup0_ex_select and passes DIFF/8 as pointer diff
#define __cache_lookup0_ex_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	RESULT = __spu_cache_lookup0_ex_select(_PTR, (const uint8** __restrict)&(LOCAL0), (const uint8* const __restrict)LOCAL1, OFFSET/8, DIFF/8, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//RESULT = __spu_cache_lookup1_select for the address MAIN + (MAIN_OFFSET - OFFSET + DIFF)/8
//LOCAL1 is written through its address (obtained line 1), LOCAL0 is the existing line 0
#define __cache_lookup1_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
	const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
	RESULT = __spu_cache_lookup1_select(_PTR1, (const uint8**)&(LOCAL1), (const uint8* const __restrict)LOCAL0, OFFSET/8, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//same as __cache_lookup1_interpolate but calls __spu_cache_lookup1_ex_select and additionally passes
//  the PPU address of the first line (_PTR0 = MAIN + (MAIN_OFFSET - OFFSET)/8)
#define __cache_lookup1_ex_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR, PREF_DIFF, MEMREF_ID) ({ \
		const uint8_t *const _PTR0 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
		const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
		RESULT = __spu_cache_lookup1_ex_select(_PTR1, (const uint8** __restrict)&(LOCAL1), (const uint8* const __restrict)LOCAL0, OFFSET/8, _PTR0, LRU_INCR, MEMREF_ID, PREF_DIFF); })

//RESULT = __spu_cache_lookup01_select, both lines are obtained:
//  _PTR0 = MAIN + (MAIN_OFFSET - OFFSET)/8 -> LOCAL0, _PTR1 = MAIN + (MAIN_OFFSET - OFFSET + DIFF)/8 -> LOCAL1
#define __cache_lookup01_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR0, LRU_INCR1, PREF_DIFF0, PREF_DIFF1, MEMREF_ID) ({ \
	const uint8_t *const _PTR0 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
	RESULT = __spu_cache_lookup01_select(_PTR0, (const uint8** __restrict)&(LOCAL0), _PTR1, (const uint8** __restrict)&(LOCAL1), OFFSET/8, LRU_INCR0, LRU_INCR1, MEMREF_ID, PREF_DIFF0, PREF_DIFF1); })

//same arguments as __cache_lookup01_interpolate but calls __spu_cache_lookup01_ex_select
#define __cache_lookup01_ex_interpolate(RESULT, MAIN, MAIN_OFFSET, LOCAL0, LOCAL1, OFFSET, DIFF, LRU_INCR0, LRU_INCR1, PREF_DIFF0, PREF_DIFF1, MEMREF_ID) ({ \
	const uint8_t *const _PTR0 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET)) / 8; \
	const uint8_t *const _PTR1 = (const uint8_t *)(MAIN) + ((MAIN_OFFSET) - (OFFSET) + (DIFF)) / 8; \
	RESULT = __spu_cache_lookup01_ex_select(_PTR0, (const uint8** __restrict)&(LOCAL0), _PTR1, (const uint8** __restrict)&(LOCAL1), OFFSET/8, LRU_INCR0, LRU_INCR1, MEMREF_ID, PREF_DIFF0, PREF_DIFF1); })

//touches the cache entry at PTR + OFFSET/8 via __spu_cache_touch
//before the call the condition (_PTR >= g_pSPUCache) is passed to spu_CheckCacheHazard and _PTR to CheckIsPresent
//PROFID is not used
#define __cache_touch(PTR, OFFSET, LRU_INCR, PROFID) ({ \
	const uint8_t *const _PTR = \
	(const uint8_t *)(PTR) + (OFFSET) / 8; \
	spu_CheckCacheHazard((unsigned int)_PTR >= (unsigned int)g_pSPUCache);\
	CheckIsPresent((unsigned int)_PTR);\
	__spu_cache_touch((const uint8* const)_PTR, LRU_INCR); })

//calls __spu_cache_prep_touch for PTR + OFFSET/8, no checks are performed here
#define __cache_touch_prep(PTR, OFFSET, TOUCH_CONST) ({ \
	const uint8_t *const _PTR = \
	(const uint8_t *)(PTR) + (OFFSET) / 8; \
	__spu_cache_prep_touch((const uint8* const)_PTR, TOUCH_CONST); })

//forwards to __spu_cache_touch_ex, PROFID is dropped
#define __cache_touch_ex(ADDR, LRU_INCR, PROFID) __spu_cache_touch_ex(ADDR, LRU_INCR)

#define __cache_range_write_async(cpFrom, cpTo) SPUAddCacheWriteRangeAsync((unsigned int)(cpFrom), (unsigned int)(cpTo))

__attribute__((always_inline))
inline void SPUAddCacheWriteRangeAsync(const unsigned int cEAFrom, const unsigned int cEATo)
{
	vec_uint4 *const __restrict pAsyncRangesDirFrom = (vec_uint4*)G_SPU_CACHE_ASYNC_RANGES_DIR_FROM;
	vec_uint4 *const __restrict pAsyncRangesDirTo		= (vec_uint4*)G_SPU_CACHE_ASYNC_RANGES_DIR_TO;
	vec_uint4 curAsyncRangesDirFrom = *pAsyncRangesDirFrom;
	vec_uint4 curAsyncRangesDirTo		= *pAsyncRangesDirTo;

	//start at next cache line boundary (unsafe to start within a cacheline)
	const unsigned int cEAAlignedFrom = (cEAFrom + scSPUCacheLineSizeMask) & ~scSPUCacheLineSizeMask;
	const unsigned int cEAAlignedTo		= (cEATo + scSPUCacheLineSizeMask) & ~scSPUCacheLineSizeMask;
	//rotate existing one 4 bytes and insert into slot 0
	curAsyncRangesDirFrom	= spu_rlqwbyte(curAsyncRangesDirFrom, 4);
	curAsyncRangesDirTo		= spu_rlqwbyte(curAsyncRangesDirTo, 4);
	curAsyncRangesDirFrom	= spu_insert(cEAAlignedFrom, curAsyncRangesDirFrom, 0);
	curAsyncRangesDirTo		= spu_insert(cEAAlignedTo, curAsyncRangesDirTo, 0);

	*pAsyncRangesDirFrom	= curAsyncRangesDirFrom;
	*pAsyncRangesDirTo		= curAsyncRangesDirTo;
};

//------------------------------------memory transfer ops---------------------------------------------------

//transfer via MemcpyLargeLS with DMA tag MEM_TRANSFER_DMA_TAG_BASE + cSyncPointID, last argument (fence flag) false
//asserts: pSrc and pDest share the same offset within 16 bytes, cSyncPointID < USER_DMA_TAG_CNT - 2
//NOTE(review): direction (main memory -> local store) is taken from the macro name - confirm against MemcpyLargeLS
#define memtransfer_from_main(pDest, pSrc, cSize, cSyncPointID/*0..13*/)({\
	assert(((unsigned int)(pSrc) & 15) == ((unsigned int)(pDest) & 15));\
	assert((cSyncPointID) < USER_DMA_TAG_CNT - 2);\
	MemcpyLargeLS((unsigned int)(pDest), (void*)(pSrc), cSize, MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID), false);})
#define memtransfer_from_main$VLM_ memtransfer_from_main

//transfer via MemcpyLargeMain with DMA tag MEM_TRANSFER_DMA_TAG_BASE + cSyncPointID, last argument false
//same asserts as memtransfer_from_main
#define memtransfer_to_main(pDest, pSrc, cSize, cSyncPointID/*0..13*/)({\
	assert(((unsigned int)(pSrc) & 15) == ((unsigned int)(pDest) & 15));\
	assert((cSyncPointID) < USER_DMA_TAG_CNT - 2);\
	MemcpyLargeMain((unsigned int)(pDest), (void*)(pSrc), cSize, MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID), false);})
#define memtransfer_to_main$VML_ memtransfer_to_main

//as memtransfer_from_main but passes true as last argument to MemcpyLargeLS
#define memtransfer_from_main_fenced(pDest, pSrc, cSize, cSyncPointID/*0..13*/)({\
	assert(((unsigned int)(pSrc) & 15) == ((unsigned int)(pDest) & 15));\
	assert((cSyncPointID) < USER_DMA_TAG_CNT - 2);\
	MemcpyLargeLS((unsigned int)(pDest), (void*)(pSrc), cSize, MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID), true);})
#define memtransfer_from_main_fenced$VLM_ memtransfer_from_main_fenced

//as memtransfer_to_main but passes true as last argument to MemcpyLargeMain
#define memtransfer_to_main_fenced(pDest, pSrc, cSize, cSyncPointID/*0..13*/)({\
	assert(((unsigned int)(pSrc) & 15) == ((unsigned int)(pDest) & 15));\
	assert((cSyncPointID) < USER_DMA_TAG_CNT - 2);\
	MemcpyLargeMain((unsigned int)(pDest), (void*)(pSrc), cSize, MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID), true);})
#define memtransfer_to_main_fenced$VML_ memtransfer_to_main_fenced

//polls the MFC tag status for a single tag without blocking
//writes the one-bit mask (1 << cTagID) to MFC_WrTagMask, requests an immediate tag update and reads MFC_RdTagStat
//returns true if the status read back differs from the mask
//NOTE(review): cTagID is used as is, MEM_TRANSFER_DMA_TAG_BASE is not added here (unlike memtransfer_sync) - confirm what callers pass
__attribute__((always_inline))
inline bool memtransfer_pending(const unsigned int cTagID)
{
	const unsigned int cSingleTagMask = (1<<(cTagID));
	spu_writech(MFC_WrTagMask, cSingleTagMask);
	si_wrch(MFC_WrTagUpdate,si_from_uint(MFC_TAG_UPDATE_IMMEDIATE));
	const unsigned int cTagStatus = si_to_uint(si_rdch(MFC_RdTagStat));
	return (cTagStatus != cSingleTagMask);
}

//waits for the transfers of one sync point: calls SyncMemory with DMA tag MEM_TRANSFER_DMA_TAG_BASE + cSyncPointID
//asserts cSyncPointID < USER_DMA_TAG_CNT - 2
//cSyncPointID is parenthesised in the tag computation (as in the memtransfer_* macros) so that
//  expression arguments such as (a ? 1 : 2) expand correctly
#define memtransfer_sync(cSyncPointID/*0..13*/)({\
	assert((cSyncPointID) < USER_DMA_TAG_CNT - 2);\
	SyncMemory(MEM_TRANSFER_DMA_TAG_BASE + (cSyncPointID));})

//------------------------------------memory allocation---------------------------------------------------

//drop any previous definitions of the C allocation names...
#undef malloc
#undef realloc
#undef free
#undef reallocalign
#undef memalign
#undef calloc
//...and redirect them to the Cry* allocation functions
#define memalign CryMemAlign
#define reallocalign CryModuleReallocAlign
#define malloc  CryModuleMalloc
#define calloc  CryModuleCalloc
#define realloc CryModuleRealloc
#define free    CryModuleFree
//NOTE(review): the $VM_ suffixed names look like decorated call names produced by the job build tooling - confirm
#define realloc$VM_ CryModuleRealloc
#define free$VM_ CryModuleFree

#if !defined(JOB_LIB_COMP)
namespace std
{
#endif
	//allocates cSize bytes rounded up to a multiple of cBoundary through PPUAlloc_func
	//the alignment of the result is only checked by the assert, it is not enforced here
	//the rounding uses (cBoundary-1) as a bit mask
	__attribute__((always_inline))
	inline void* CryMemAlign(const size_t cBoundary, const size_t cSize)
	{
		const size_t cAlignMask		= cBoundary-1;
		const size_t cRoundedSize	= (cSize + cAlignMask) & ~cAlignMask;
		void *pBlock = NULL;
		PPUAlloc_func(pBlock, cRoundedSize);
		assert(((unsigned int)pBlock & cAlignMask) == 0);
		return pBlock;
	}

	//same allocation as CryMemAlign but with swapped parameter order (size first, alignment second)
	//allocates cSize rounded up to a multiple of cAlign through PPUAlloc_func
	//the alignment of the result is only checked by the assert
	__attribute__((always_inline))
	inline void *CryModuleMemalign(const size_t cSize, const size_t cAlign)
	{
		const size_t cAlignMask		= cAlign - 1;
		const size_t cRoundedSize	= (cSize + cAlignMask) & ~cAlignMask;
		void *pBlock = NULL;
		PPUAlloc_func(pBlock, cRoundedSize);
		assert(((unsigned int)pBlock & cAlignMask) == 0);
		return pBlock;
	}

	//releases a block through PPUFree_func (counterpart to CryModuleMemalign)
	__attribute__((always_inline))
	inline void CryModuleMemalignFree(void *pPtr)
	{
		PPUFree_func(pPtr);
	}
	//decorated name maps to the same function
	#define CryModuleMemalignFree$VM_ CryModuleMemalignFree

#if !defined(JOB_LIB_COMP)
}
using std::CryMemAlign;
#endif
#define memalign CryMemAlign

#if !defined(JOB_LIB_COMP)
namespace std
{
#endif
	// These must live in the std namespace to make sure the #defines above also
	// work for std::malloc() etc.

	//malloc replacement: requests cSize bytes from PPUAlloc_func and returns the pointer it produced
	//the pointer starts out as NULL, so NULL is returned if PPUAlloc_func leaves it untouched
	__attribute__((always_inline))
	inline void* CryModuleMalloc(const size_t cSize) throw()
	{
		void *pBlock = NULL;
		PPUAlloc_func(pBlock, cSize);
		return pBlock;
	}

	//calloc replacement: passes cSize and cNum on to PPUCalloc_func and returns the pointer it produced
	//NOTE(review): the first parameter is named cSize and the second cNum, the reverse of calloc(num, size) - confirm PPUCalloc_func expects this order
	__attribute__((always_inline))
	inline void* CryModuleCalloc(const size_t cSize, const size_t cNum) throw()
	{
		void *pBlock = NULL;
		PPUCalloc_func(pBlock, cSize, cNum);
		return pBlock;
	}

	//aligned realloc replacement: always allocates a new block via CryMemAlign(cBoundary, cSize)
	//if pPtr is not NULL, cSize bytes are copied from the old block and the old block is freed
	//the size of the old block is unknown here, so the copy is conservative (full new size)
	//the ECryModule argument is ignored
	__attribute__((always_inline))
	inline void* CryModuleReallocAlign(void *pPtr, const size_t cSize, const size_t cBoundary, ECryModule)  throw()
	{
		void *pAligned = CryMemAlign(cBoundary, cSize);
		if(!pPtr)
			return pAligned;//nothing to carry over
		memcpy$VMM_(pAligned, pPtr, cSize);//due to unknown source size, copy conservative
		PPUFree_func(pPtr);
		return pAligned;
	}

	//convenience overload without module id: forwards to the 4 argument version with eCryM_Launcher
	__attribute__((always_inline))
	inline void* CryModuleReallocAlign(void *pPtr, const size_t cSize, const size_t cBoundary)  throw()
	{
		return CryModuleReallocAlign(pPtr, cSize, cBoundary, eCryM_Launcher);
	};

	//realloc replacement: always allocates a new block of cSize bytes via PPUAlloc_func
	//if pPtr is not NULL, cSize bytes are copied from the old block and the old block is freed
	//the size of the old block is unknown here, so the copy is conservative (full new size)
	__attribute__((always_inline))
	inline void* CryModuleRealloc(void *pPtr, const size_t cSize)  throw()
	{
		void *pFresh = NULL;
		PPUAlloc_func(pFresh, cSize);
		if(!pPtr)
			return pFresh;//nothing to carry over
		memcpy$VMM_(pFresh, pPtr, cSize);//due to unknown source size, copy conservative
		PPUFree_func(pPtr);
		return pFresh;
	}

	//free replacement: releases pPtr through PPUFree_func
	//always returns 0
	__attribute__((always_inline))
	inline size_t CryModuleFree(void *pPtr) throw()
	{
		PPUFree_func(pPtr);
		return 0;
	};

	//malloc replacement taking a module id: the ECryModule argument is ignored
	//requests cSize bytes from PPUAlloc_func and returns the pointer it produced
	__attribute__((always_inline))
	inline void* CryModuleMalloc(const size_t cSize, ECryModule) throw()
	{
		void *pBlock = NULL;
		PPUAlloc_func(pBlock, cSize);
		return pBlock;
	}

	//calloc replacement taking a module id: the ECryModule argument is ignored
	//passes cSize and cNum on to PPUCalloc_func and returns the pointer it produced
	__attribute__((always_inline))
	inline void* CryModuleCalloc(const size_t cSize, const size_t cNum, ECryModule) throw()
	{
		void *pBlock = NULL;
		PPUCalloc_func(pBlock, cSize, cNum);
		return pBlock;
	}

	__attribute__((always_inline))
	inline void* CryModuleRealloc(void *pPtr, const size_t cSize, ECryModule)  throw()
	{
		PPUFree_func(pPtr);
		PPUAlloc_func(pPtr, cSize);
		return pPtr;
	};

	//free replacement taking a module id: the ECryModule argument is ignored, pPtr is released through PPUFree_func
	//returns nothing (the overload without module id returns size_t)
	__attribute__((always_inline))
	inline void CryModuleFree(void *pPtr, ECryModule) throw()
	{
		PPUFree_func(pPtr);
	};
#if !defined(JOB_LIB_COMP)
	//decorated names map to the plain functions
	//NOTE(review): the $VM_ suffix looks like a decoration added by the job build tooling - confirm
	#define CryModuleRealloc$VM_ CryModuleRealloc
	#define CryModuleFree$VM_ CryModuleFree
	#define reallocalign$VM_ CryModuleReallocAlign
}
#endif

//global operator new: requests cSize bytes from PPUAlloc_func and returns the pointer it produced
//the body contains no throw statement, the pointer is returned as delivered
__attribute__((always_inline))
inline void * operator new(size_t cSize) throw (std::bad_alloc) 
{
	void *pObjMem = NULL;
	PPUAlloc_func(pObjMem, cSize);
	return pObjMem;
}

//nothrow operator new: same allocation path as the plain version, the nothrow tag is not inspected
__attribute__((always_inline))
inline void* operator new (size_t cSize, const std::nothrow_t &nothrow) throw()
{
	void *pObjMem = NULL;
	PPUAlloc_func(pObjMem, cSize);
	return pObjMem;
}
//alternative spellings which map to operator new
#define $operator_new operator new
#define __operator_new operator new

//global operator new[]: requests cSize bytes from PPUAlloc_func and returns the pointer it produced
//the body contains no throw statement, the pointer is returned as delivered
__attribute__((always_inline))
inline void* operator new[](size_t cSize) throw (std::bad_alloc) 
{
	void *pArrayMem = NULL;
	PPUAlloc_func(pArrayMem, cSize);
	return pArrayMem;
}
//alternative spellings which map to operator new[]
#define $operator_array_new operator new[]
#define __operator_array_new operator new[]

//nothrow operator new[]: same allocation path as the plain version, the nothrow tag is not inspected
__attribute__((always_inline))
inline void* operator new[] (size_t cSize, const std::nothrow_t &nothrow) throw()
{
	void *pArrayMem = NULL;
	PPUAlloc_func(pArrayMem, cSize);
	return pArrayMem;
}

//global operator delete: releases pPtr through PPUFree_func
__attribute__((always_inline))
inline void operator delete(void *pPtr) throw()
{
	PPUFree_func(pPtr); 
}
//decorated spelling maps to operator delete
#define $operator_delete$VM_ operator delete

//nothrow operator delete: releases pPtr through PPUFree_func, the nothrow tag is not inspected
__attribute__((always_inline))
inline void operator delete(void *pPtr, const std::nothrow_t&) throw()
{
	PPUFree_func(pPtr); 
}

//global operator delete[]: releases pPtr through PPUFree_func
__attribute__((always_inline))
inline void operator delete[](void *pPtr) throw()
{ 
	PPUFree_func(pPtr);
}
//both decorated spellings map to operator delete[]
#define $operator_array_delete$VM_ operator delete[]
#define $operator_array_delete$VL_ operator delete[]

//nothrow operator delete[]: releases pPtr through PPUFree_func, the nothrow tag is not inspected
__attribute__((always_inline))
inline void operator delete[](void *pPtr, const std::nothrow_t&) throw()
{ 
	PPUFree_func(pPtr);
}

//operator new with alignment argument
//cAlignment is only checked by the assert (<= 128), the allocation itself requests cSize bytes unchanged
//  (a size adjustment via CondSelMax was disabled in the original code)
__attribute__((always_inline))
inline void *operator new(_CSTD size_t cSize, _CSTD size_t cAlignment) throw (std::bad_alloc)
{
	assert(cAlignment <= 128);
	const unsigned int cRequestSize = cSize;//CondSelMax((unsigned int)cSize, (unsigned int)cAlignment);
	void *pObjMem = NULL;
	PPUAlloc_func(pObjMem, cRequestSize);
	return pObjMem;
}

//nothrow operator new with alignment argument
//cAlignment is only checked by the assert (<= 128), the allocation itself requests cSize bytes unchanged
__attribute__((always_inline))
inline void *operator new(_CSTD size_t cSize, _CSTD size_t cAlignment, const std::nothrow_t&) throw()
{
	assert(cAlignment <= 128);
	const unsigned int cRequestSize = cSize;//CondSelMax((unsigned int)cSize, (unsigned int)cAlignment);
	void *pObjMem = NULL;
	PPUAlloc_func(pObjMem, cRequestSize);
	return pObjMem;
}

//operator new[] with alignment argument
//cAlignment is only checked by the assert (<= 128), the allocation itself requests cSize bytes unchanged
__attribute__((always_inline))
inline void *operator new[](_CSTD size_t cSize, _CSTD size_t cAlignment) throw (std::bad_alloc)
{
	assert(cAlignment <= 128);
	const unsigned int cRequestSize = cSize;//CondSelMax((unsigned int)cSize, (unsigned int)cAlignment);
	void *pArrayMem = NULL;
	PPUAlloc_func(pArrayMem, cRequestSize);
	return pArrayMem;
}

//nothrow operator new[] with alignment argument
//cAlignment is only checked by the assert (<= 128), the allocation itself requests cSize bytes unchanged
__attribute__((always_inline))
inline void *operator new[](_CSTD size_t cSize, _CSTD size_t cAlignment, const std::nothrow_t&) throw()
{
	assert(cAlignment <= 128);
	const unsigned int cRequestSize = cSize;//CondSelMax((unsigned int)cSize, (unsigned int)cAlignment);
	void *pArrayMem = NULL;
	PPUAlloc_func(pArrayMem, cRequestSize);
	return pArrayMem;
}

//------------------------------------code paging---------------------------------------------------

//sets the currently active page IDs
//asm inline instructions are there to create the instruction slot for the page id replacement
//this is necessary since the extern int placeholders create lqr instructions and this is on pipe 1 
//	(other than pipe 0 where ila is located at) (do not change the instruction, binary op code is looked up)
/*__attribute__((always_inline))
inline void __spu_set_active_pages(const vec_uchar16 cID0)
{
	SPUSetActivePagesFunc(cID0, spu_maskb((unsigned short)65535), spu_maskb((unsigned short)65535), spu_maskb((unsigned short)65535));
};
*/
__attribute__((always_inline))
inline void __spu_set_active_pages(const vec_uchar16 cID0, const vec_uchar16 cID1)
{
	SPUSetActivePagesFunc(cID0, cID1, spu_maskb((unsigned short)65535), spu_maskb((unsigned short)65535));
};

//sets the currently active page IDs
__attribute__((always_inline))
inline void __spu_set_active_pages(const vec_uchar16 cID0, const vec_uchar16 cID1, const vec_uchar16 cID2)
{
	SPUSetActivePagesFunc(cID0, cID1, cID2, spu_maskb((unsigned short)65535));
};

//sets the currently active page IDs
//4 ID version: all arguments are passed to SPUSetActivePagesFunc unchanged
__attribute__((always_inline))
inline void __spu_set_active_pages(const vec_uchar16 cID0, const vec_uchar16 cID1, const vec_uchar16 cID2, const vec_uchar16 cID3)
{
	SPUSetActivePagesFunc(cID0, cID1, cID2, cID3);
};

//obsolete
//old name kept as alias for __spu_set_active_pages
#define __spu_set_active_bubbles __spu_set_active_pages

//prefetches the most recent destination page from a non local history table
//  - source page: sourcePageID of funcData[0]
//  - destination page: destPageID of the funcData entry at NSPU::GetLargestIndex(cpHistTable->lru)
//both IDs are expanded with spu_maskb, the remaining two arguments are spu_maskb(65535)
__attribute__((always_inline))
inline void __spu_set_active_pages_from_hist_table(const NSPU::SFuncHistoryTable* const __restrict cpHistTable)
{
	const vec_uchar16 cAllOnes = spu_maskb((unsigned short)65535);
	NPageBin::SCrossPatch* const pLargestLruEntry	= (NPageBin::SCrossPatch*)&cpHistTable->funcData[NSPU::GetLargestIndex(cpHistTable->lru)];
	const vec_uchar16 cDestPageEncoded	= spu_maskb(pLargestLruEntry->destPageID);
	NPageBin::SCrossPatch* const pFirstEntry			= (NPageBin::SCrossPatch*)&cpHistTable->funcData[0];
	const vec_uchar16 cSrcPageEncoded		= spu_maskb(pFirstEntry->sourcePageID);
	SPUSetActivePagesFunc(cSrcPageEncoded, cDestPageEncoded, cAllOnes, cAllOnes);
}

//init the upcoming cross page call, force an insertion of a lqr instruction into $71
//also increment the return stack
//the expansion itself only stores cCrossData (cast to vec_ushort8) into g_CrossPageData
//NOTE(review): the lqr insertion and return stack handling mentioned above are not visible in this macro - confirm where they happen
#define __spu_init_cross_call(cCrossData)({\
	g_CrossPageData = (vec_ushort8)cCrossData;})


//emits an ilhu/iohl pair on register $71 (both with immediate 0) and then calls SPUResolveGlobalVarFunc(cGlobVarOff)
//NOTE(review): the zero immediates are presumably placeholders which get patched later - confirm
#define __cache_resolve_global_var_addr(cGlobVarOff)({\
	__asm__ volatile ("ilhu $71,0" :  : );\
	__asm__ volatile ("iohl $71,0" :  : );\
	SPUResolveGlobalVarFunc(cGlobVarOff);})

//calls SPUGenFuncPtrFromId with the ID cast to unsigned int and casts the result to NSPU::SPageFuncPtr
#define __spu_gen_funcptr_from_id(cFuncID) (NSPU::SPageFuncPtr)SPUGenFuncPtrFromId((unsigned int)cFuncID)

//this makes the parent job not call any callback or set the external job state
//check if there is not already been a registered job
//  - asserts that word JOB_SPAWN_STATE_WORD of the register at G_SPU_JOB_SPAWN_REG is 0
//  - stores cNewPPUPushEA into word JOB_SPAWN_PUSH_WORD of that register
#define SetEnableSPUJobAtParentExit(cNewPPUPushEA)({\
	vec_uint4 *const __restrict pJobSpawnReg = (vec_uint4*)(void*)G_SPU_JOB_SPAWN_REG;\
	assert(spu_extract(*pJobSpawnReg, JOB_SPAWN_STATE_WORD) == 0);\
	*pJobSpawnReg = spu_insert((unsigned int)cNewPPUPushEA, *pJobSpawnReg, JOB_SPAWN_PUSH_WORD);})

#if !defined(_LIB_DRIVER)
	#include <PPU/SPUJobBase.h>
#endif

//------------------------------------------------lib gcm---------------------------------------------

//declarations only, the definitions are not part of this header
extern int cellGcmAddressToOffset(const void*, unsigned int *__restrict);
//decorated name maps to the plain function
#define cellGcmAddressToOffset$VML_ cellGcmAddressToOffset

//tick counter access: get-and-reset functions...
extern unsigned int cellGcmGetAndResetRSXWaitTicks();
extern unsigned int cellGcmGetAndResetPerfTicks0();
extern unsigned int cellGcmGetAndResetPerfTicks1();
extern unsigned int cellGcmGetAndResetPerfTicks2();
extern unsigned int cellGcmGetAndResetPerfTicks3();
//...and the corresponding add functions
extern void cellGcmAddRSXWaitTicks(const unsigned int, const unsigned int);
extern void cellGcmAddPerfTicks0(const unsigned int);
extern void cellGcmAddPerfTicks1(const unsigned int);
extern void cellGcmAddPerfTicks2(const unsigned int);
extern void cellGcmAddPerfTicks3(const unsigned int);
extern void cellGcmAddRSXStallTicks(const unsigned int, const unsigned int);

extern void cellGcmUpdateGlobalPPUContext();

//do not change tag since it used for syncing flush too
extern void cellGcmSyncUCodeLS();

//returns the CellGcmSPUData pointer stored as unsigned int at G_SPU_GCM_CONTEXT_LOCAL_ADDR
//counterpart to SetGcmSPUData
__attribute__((always_inline))
inline CellGcmSPUData* GetGcmSPUData()
{
	const unsigned int cStoredAddr = *(unsigned int*)G_SPU_GCM_CONTEXT_LOCAL_ADDR;
	return (CellGcmSPUData*)(void*)cStoredAddr;
}

__attribute__((always_inline))
inline unsigned int RSXLocalAddress()
{
	return (((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmRsxBaseAddress);
}

//stores the given CellGcmSPUData pointer as unsigned int at G_SPU_GCM_CONTEXT_LOCAL_ADDR
//counterpart to GetGcmSPUData
__attribute__((always_inline))
inline void SetGcmSPUData(CellGcmSPUData* const __restrict pLocalGcmSPUData)
{
	unsigned int* const pStoredAddr = (unsigned int*)G_SPU_GCM_CONTEXT_LOCAL_ADDR;
	*pStoredAddr = (unsigned int)(void*)pLocalGcmSPUData;
}

//declarations only, the definitions are not part of this header
//the #define lines map decorated names to the plain functions
extern uint8_t* cellGcmGetPSBuf();
extern uint8_t* cellGcmGetVSBuf();
struct CellGcmLocalContextData;
//parameters are unnamed here: a context pointer to fill in, the SPU data and three buffer pointer / uint32_t pairs
//NOTE(review): the uint32_t values are presumably the buffer sizes - confirm against the definition
extern void cellGcmInitLocalGcmContext
(
	CellGcmLocalContextData *__restrict*, 
	CellGcmSPUData *__restrict, 
	uint8_t *__restrict, 
	const uint32_t, 
	uint8_t *__restrict,
	const uint32_t,
	uint8_t *__restrict,
	const uint32_t	
);
#define cellGcmInitLocalGcmContext$VLLLLL_ cellGcmInitLocalGcmContext

extern int cellGcmSetFlip(const uint8_t);
extern void cellGcmSetVertexProgram(const uint32_t*, const void * __restrict);
#define cellGcmSetVertexProgram$VLL_ cellGcmSetVertexProgram
struct CellGcmSurface;
extern void cellGcmSetSurfaceWindow(const CellGcmSurface*, const uint32_t, const uint32_t);
#define cellGcmSetSurfaceWindow$VL_ cellGcmSetSurfaceWindow
extern void cellGcmSetVertexDataArray(uint8_t index, uint16_t frequency, uint8_t stride, uint8_t size, uint8_t type, uint8_t location, uint32_t offset);
//extern unsigned long long cellGcmGetTimeStampLocation(const unsigned int index);
extern void cellGcmSetWaitFlip();
extern void cellGcmFlush();
extern uint64 cellGcmSyncToRSX(const uint64, unsigned int&, bool, bool, int);
#define cellGcmSyncToRSX$VL_ cellGcmSyncToRSX

/*__attribute__((always_inline))
inline void cellGcmMemCpy$VLM_(void* const __restrict pLocalDst, const void* const __restrict cpSrc, const unsigned int cSize)
{
	//it works on a 4 byte base and size is 4..128 bytes 
	//from main to local cmd buffer
	assert(cSize >= 4 && cSize <= 128);
	unsigned int *__restrict pDst	= (unsigned int*)pLocalDst;
	//get cache ptr to first elem
	unsigned int copiedBytes = 0;
	const unsigned int* __restrict pFirstCacheDst	= (unsigned int*)__spu_cache_lookup(cpSrc, 1, 128);
	//copy all within that cache line
	do
	{
		*pDst++ = *pFirstCacheDst++;
		copiedBytes += 4;
	}
	WHILE(copiedBytes < cSize && ((unsigned int)pFirstCacheDst & 127) != 0, 1);
	IF(copiedBytes < cSize, 1)
	{
		//get cache ptr to last but one elem (ensures we have the full range)
		const unsigned int* __restrict pNextCacheDst = (unsigned int*)__spu_cache_lookup((unsigned int)cpSrc + copiedBytes, 1, 128);
		do
		{
			*pDst++ = *pNextCacheDst++;
			copiedBytes += 4;
		}
		WHILE(copiedBytes < cSize, 1);
	}
}
*/
//ucode copy helpers, defined elsewhere; the $-suffixed macro names are aliases of the plain functions
//NOTE(review): by name this copies ucode into local store (LS) and returns a pointer to the copy - confirm
extern uint8* cellGcmCpyUCodeLS(void* const __restrict, const unsigned int, const unsigned int);
#define cellGcmCpyUCodeLS$VM_ cellGcmCpyUCodeLS

//same signature with named parameters: the ucode pointer, its size and an offset
extern uint8* cellGcmCpyVertexCodeLS(void* const __restrict pMainUCode, const unsigned int cUCodeSize, const unsigned int cOff);
#define cellGcmCpyVertexCodeLS$VM_ cellGcmCpyVertexCodeLS

//NOTE(review): presumably waits for the transfer started by cellGcmCpyVertexCodeLS - confirm
extern void cellGcmCpySyncVertexCode();
//syncs the ucode transfer back to main memory and the command buffer transfer (expands to SyncMemory on g_scDMAListTag)
#define cellGcmSyncTransferToMain()	SyncMemory(g_scDMAListTag)

//copy helpers taking destination, source and an unsigned int count (parameters are unnamed in these declarations)
extern void cellGcmCpyUCodeMain(void* const __restrict, const void* const __restrict, const unsigned int);
extern void cellGcmCpyUCodeMainFromLS(void* const __restrict, const void* const __restrict, const unsigned int);
#define cellGcmCpyUCodeMain$VMM_ cellGcmCpyUCodeMain
#define cellGcmCpyUCodeMainFromLS$VML_ cellGcmCpyUCodeMainFromLS

//Word-wise copy from cpLocalSrc to pLocalDst, both accessed as unsigned int (4 byte) elements.
//	pLocalDst  - destination pointer
//	cpLocalSrc - source pointer
//	cSize      - byte count, evaluated exactly once
//The loop is bottom-tested: one word is always copied, then copying continues in 4 byte steps while
//fewer than cSize bytes have been copied.
//The temporaries carry a __gcmCpy prefix so that arguments spelled pDst, pSrc or copiedBytes at the
//call site are not captured by the macro's own locals (which would self-initialize them).
#define cellGcmMemCpy$VLL_(pLocalDst, cpLocalSrc, cSize)({\
	unsigned int *__restrict __gcmCpyDst	= (unsigned int*)(pLocalDst);\
	const unsigned int *__restrict __gcmCpySrc	= (const unsigned int*)(cpLocalSrc);\
	const unsigned int __gcmCpySize = (unsigned int)(cSize);\
	unsigned int __gcmCpyDone = 0;\
	do\
	{\
		*__gcmCpyDst++ = *__gcmCpySrc++;\
		__gcmCpyDone += 4;\
	}\
	WHILE(__gcmCpyDone < __gcmCpySize, 1);})

//default copy variant: local to local
#define cellGcmMemCpy cellGcmMemCpy$VLL_

//-------------------------------------------------spu function profiling-----------------------------------------------

//Manually driven profiling timer: Start() samples rdtsc(), Stop() adds the measured delta to a timing slot.
//Both calls do nothing while G_SPU_FUNC_PROF_SPU_AREA evaluates to zero.
struct __spu_func_prof_helper
{
	unsigned int start;	//rdtsc() sample taken by the most recent Start()

	//samples the counter if the profiling area is set
	ILINE void Start()
	{
		if(!(G_SPU_FUNC_PROF_SPU_AREA))
			return;
		start = rdtsc();
	}

	//adds (start - rdtsc()) to entry index*2 of the timing area and increments entry index*2 + 1
	ILINE void Stop(unsigned int index) const
	{
		if(!(G_SPU_FUNC_PROF_SPU_AREA))
			return;
		NSPU::SFuncProfSPUTiming *const pProfArea = (NSPU::SFuncProfSPUTiming*)G_SPU_FUNC_PROF_SPU_AREA;
		unsigned int *const pTimingBase = (unsigned int*)pProfArea->funcProfSPUTimingAreaBss;
		uint32 *const __restrict pSlot = &pTimingBase[index*2];
		pSlot[0] += start - rdtsc();
		++pSlot[1];
	}
};

//Scope based profiling timer: the constructor samples rdtsc() and remembers the slot index, the destructor
//adds the measured delta to that slot. Both do nothing while G_SPU_FUNC_PROF_SPU_AREA evaluates to zero,
//in which case start and index are left unset.
struct __spu_func_prof_helper_self
{
	unsigned int start;	//rdtsc() sample taken at construction
	unsigned int index;	//slot index handed to the constructor

	ILINE __spu_func_prof_helper_self(unsigned int ind)
	{
		if(!(G_SPU_FUNC_PROF_SPU_AREA))
			return;
		start = rdtsc();
		index = ind;
	}

	//adds (start - rdtsc()) to entry index*2 of the timing area and increments entry index*2 + 1
	ILINE ~__spu_func_prof_helper_self()
	{
		if(!(G_SPU_FUNC_PROF_SPU_AREA))
			return;
		NSPU::SFuncProfSPUTiming *const pProfArea = (NSPU::SFuncProfSPUTiming*)G_SPU_FUNC_PROF_SPU_AREA;
		unsigned int *const pTimingBase = (unsigned int*)pProfArea->funcProfSPUTimingAreaBss;
		uint32 *const __restrict pSlot = &pTimingBase[index*2];
		pSlot[0] += start - rdtsc();
		++pSlot[1];
	}
};

//Scoped profiling of section X: declares an __spu_func_prof_helper_self object named __funcProfHelper<X>
//constructed with X, so X is used both as part of an identifier and as the slot index value.
#undef SPU_PROFILER_SECTION
#define SPU_PROFILER_SECTION(X)	__spu_func_prof_helper_self __funcProfHelper##X (X)

//Manual profiling of section X, step 1: declares the __spu_func_prof_helper object named __funcProfHelper<X>.
#undef SPU_PROFILER_SECTION_INIT
#define SPU_PROFILER_SECTION_INIT(X) __spu_func_prof_helper __funcProfHelper##X

//Manual profiling of section X, step 2: calls Start() on the object declared by SPU_PROFILER_SECTION_INIT(X).
#undef SPU_PROFILER_SECTION_BEGIN
#define SPU_PROFILER_SECTION_BEGIN(X) (__funcProfHelper##X).Start()

//Manual profiling of section X, step 3: calls Stop(X) on the same object, X being the slot index.
#undef SPU_PROFILER_SECTION_END
#define SPU_PROFILER_SECTION_END(X) (__funcProfHelper##X).Stop(X)

//Invokes SPUTransferFuncProfStatsFunc, but only if its value converted to uint32 is non-zero.
__attribute__((always_inline))
inline void __spu_transfer_func_prof_stats()
{
	const bool cHandlerSet = ((uint32)SPUTransferFuncProfStatsFunc != 0);
	if(cHandlerSet)
	{
		SPUTransferFuncProfStatsFunc();
	}
}

//-------------------------------------------------raw spu sntuner integration-----------------------------------------------
/*
__attribute__ ((noinline))
static void __spu_prof_start(int level)
{
	typedef union { char c4[4]; uint16_t u16[2]; uint32_t u32; } Module_u;
	const Module_u s_mu = { { 'P', 'h', 'y', 's' } };
	__spu_insert_bookmark( 0xffaa );	// start marker 1
	__spu_insert_bookmark( s_mu.u16[0] );	// name
	__spu_insert_bookmark( s_mu.u16[1] );	// name
	__spu_insert_bookmark( level );	// level
	__spu_insert_bookmark( __spu_get_cur_page_addr() >> 2);		// LSA
	__spu_insert_bookmark( 0xffab );	// start marker 2
__spu_insert_bookmark( 0x3E0 );
}

__attribute__ ((noinline))
static void __spu_prof_stop()
{
	typedef union { uint16_t u16[4]; uint32_t u32[2]; uint64_t u64; } GUID_u;
	GUID_u guid;
	qword cPageGUID = (qword)si_lqd((qword)spu_promote(__spu_get_cur_page_addr(),0), 256);
	qword insn = si_roti(cPageGUID, 7);
	qword pattern = (qword)(vec_uchar16){0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13};
	guid.u64 = si_to_ullong(si_shufb(insn, insn, pattern));

	__spu_insert_bookmark( 0xffac );	// start marker 1
	__spu_insert_bookmark( guid.u16[0] );	// guid
	__spu_insert_bookmark( guid.u16[1] );	// guid
	__spu_insert_bookmark( guid.u16[2] );	// guid
	__spu_insert_bookmark( guid.u16[3] );	// guid
	__spu_insert_bookmark( 0xffad );	// start marker 2
__spu_insert_bookmark( 0x3E0 );
}

class CSPUProfilerHelper
{
public:
	ILINE CSPUProfilerHelper(int level) {__spu_prof_start(level);}
	ILINE ~CSPUProfilerHelper(){__spu_prof_stop();}
};
#undef SPU_POFILER_SECTION
#define SPU_POFILER_SECTION CSPUProfilerHelper helper(3);
#undef SPU_DEBUG_BREAK
#define SPU_DEBUG_BREAK IF(IsDebugEnabled(),0) __asm volatile ("stop 255");CSPUProfilerHelper helper(2);
*/
//-------------------------------------------------misc-----------------------------------------------
//alias: the $VMM_ suffixed name resolves to the plain function below
#define __spu_toggle_ppu_callback$VMM_ __spu_toggle_ppu_callback

//Forwards pFunc and pArg unchanged to CustomCallbackHandler.
//NOTE(review): presumably requests execution of pFunc(pArg) on the PPU - confirm against CustomCallbackHandler
__attribute__((always_inline))
inline void __spu_toggle_ppu_callback(void* pFunc, void* pArg)
{
	CustomCallbackHandler(pFunc, pArg);
}

//Calls SPUExecPPUCall with both arguments converted to unsigned int.
//Each argument is parenthesized before its cast so that a compound expression is converted as a whole
//instead of only its first operand.
#define __spu_exec_ppu_call(cArg, cOpCode) SPUExecPPUCall((unsigned int)(cArg), (unsigned int)(cOpCode))

//forwards to CleanupMemory()
#define __spu_cleanup_memory() CleanupMemory()

//forwards to SPUProfDataHandlerFunc()
#define __spu_dump_prof_stats()	SPUProfDataHandlerFunc()

//Zeroes the profiling counters starting at G_SPU_CACHE_PROF_ID_COUNTER_ADDR, one vec_uint4 per iteration.
//The cleared range is MAX_PROF_ID * (4*3) bytes, rounded down to a whole number of vec_uint4 elements.
//NOTE(review): the 4*3 factor presumably means three 4 byte counters per profile id - confirm
#define __spu_reset_prof_stats()({\
	vec_uint4 *pSPUCacheProfIDCounter = (vec_uint4*)(void*)G_SPU_CACHE_PROF_ID_COUNTER_ADDR;\
	for(unsigned int i=0; i<MAX_PROF_ID * (4*3) / sizeof(vec_uint4); ++i)\
		pSPUCacheProfIDCounter[i] = (vec_uint4){0};})

	//Marks the job as finished by copying the 16 bytes at G_SPU_ZERO16 to the main memory address stored at
	//G_SPU_INFO_PACKET_ADDR, using g_scDMAOutputTag. Must not be called again; only for non returning jobs.
	//With SUPP_SN the pad field of the spu_mod_hdr at local store address 0 is additionally set to SPUThreadState.
	//The SUPP_SN variant is wrapped in a statement expression so that both statements stay together when
	//the macro is used as the body of an unbraced if/else/loop (previously only the first one was guarded).
#if defined(SUPP_SN)
#define __spu_unregister_job()({\
	MemcpyMain(*(unsigned int*)G_SPU_INFO_PACKET_ADDR, (void*)G_SPU_ZERO16, 16, g_scDMAOutputTag);\
	((spu_mod_hdr*)0)->pad = SPUThreadState;})
#else
	#define __spu_unregister_job() MemcpyMain(*(unsigned int*)G_SPU_INFO_PACKET_ADDR, (void*)G_SPU_ZERO16, 16, g_scDMAOutputTag)
#endif//SUPP_SN

#if defined(SUPP_SN)
	//Returns the int stored at G_SPU_DEBUG_STATE; SPU_DEBUG_BREAK uses it as its condition.
	//Only available when SUPP_SN is defined.
	__attribute__((always_inline))
	inline const int IsDebugEnabled()
	{
		return *(int*)G_SPU_DEBUG_STATE;
	}
#endif

__attribute__((always_inline))
inline void __spu_set_dabr(void* cLSAddr)
{
	if(G_SPU_DABR_ADDR)
	{
		SDABR *const pDABR = (SDABR*)(void*)G_SPU_DABR_ADDR;
		pDABR->lsAddr = (unsigned int)cLSAddr;
		pDABR->oldVal = *(unsigned int*)(void*)cLSAddr;
	}
}

__attribute__((always_inline))
inline void __spu_set_dabr_ppu(const unsigned int cEA)
{
	if(G_SPU_DABR_ADDR)
	{
		SDABR *const pDABR	= (SDABR*)(void*)G_SPU_DABR_ADDR;
		pDABR->ppuEA				= cEA & ~127;
	}
}

__attribute__((always_inline))
inline void __spu_set_dabr_add_data(const unsigned int cAddData)
{
	if(G_SPU_DABR_ADDR)
	{
		SDABR *const pDABR = (SDABR*)(void*)G_SPU_DABR_ADDR;
		pDABR->addData		 = cAddData;
	}
}

//Returns the addData field of the SDABR record at G_SPU_DABR_ADDR (the value stored by
//__spu_set_dabr_add_data). Returns 0 while G_SPU_DABR_ADDR evaluates to zero; previously that path
//flowed off the end of the function without returning a value.
__attribute__((always_inline))
inline const unsigned int __spu_get_dabr_add_data()
{
	if(G_SPU_DABR_ADDR)
	{
		SDABR *const pDABR = (SDABR*)(void*)G_SPU_DABR_ADDR;
		return pDABR->addData;
	}
	//no DABR record available
	return 0;
}

//Compares the unsigned int at the recorded address (pDABR->lsAddr) with the recorded value (pDABR->oldVal).
//If they differ, snPause() is called and the new value is recorded. Skipped while G_SPU_DABR_ADDR is zero.
//NOTE(review): this expands to a bare if block, so "if(c) __spu_check_dabr(); else ..." does not compile;
//put braces around the invocation at such call sites.
#define __spu_check_dabr()\
	if(G_SPU_DABR_ADDR)\
	{\
		SDABR *const __restrict pDABR = (SDABR*)(void*)G_SPU_DABR_ADDR;\
		const unsigned int cCurDABRVal = *((unsigned int*)(void*)pDABR->lsAddr);\
		if(pDABR->oldVal != cCurDABRVal)\
		{\
			snPause();\
			pDABR->oldVal = cCurDABRVal;\
		}\
	}

//expands to nothing here
#define __spu_cache_barrier()

//returns G_SPU_LOG interpreted as an ILog pointer
__attribute__((always_inline))
inline ILog* GetISPULog(){return (ILog*)G_SPU_LOG;}

//forwards the format string and the va_list unchanged to SPULogMessageV
__attribute__((always_inline))
inline void LogMessageV(const char* szFormat, va_list args)
{
	SPULogMessageV(szFormat, args);
}

#else //_SPU_JOB
	//empty stand-in used when _SPU_JOB is not defined: accepts and ignores its argument
	inline void __spu_dma_pref(const unsigned int){}
#endif //_SPU_JOB

//SPU_DEBUG_BREAK: with SUPP_SN it executes "stop 255" if IsDebugEnabled() returns non-zero, otherwise it
//expands to nothing. Use it as a statement followed by a semicolon.
//The conditional is wrapped in do/while(0) so that an else following the invocation at the call site
//binds to the caller's if and not to the if hidden inside the macro.
#if defined(SUPP_SN)
	#undef SPU_DEBUG_BREAK
	#define SPU_DEBUG_BREAK do{ if(IsDebugEnabled()) __asm volatile ("stop 255"); }while(0)
#else
	#define SPU_DEBUG_BREAK
#endif

#if !defined(_SPU_JOB)
	//driver side globals (defined elsewhere), only declared when not compiling a job
	namespace NSPU
	{
		namespace NDriver
		{
			extern unsigned int g_SPUId;						//returned by __spu_get_current_id() in non job builds
			extern unsigned int g_DestStatsAreaEA;	//destination address used by __spu_transfer_frame_stats() in non job builds
		}
	}
#endif

//Returns the id of the current SPU: jobs read it from G_SPU_ID_ADDR, all other SPU code uses the
//driver global NSPU::NDriver::g_SPUId.
__attribute__((always_inline))
inline unsigned int __spu_get_current_id()
{
#if defined(_SPU_JOB)
	return *(unsigned int*)G_SPU_ID_ADDR;
#else
	return NSPU::NDriver::g_SPUId;
#endif
}

//Adds cTickCount to the counter of the current SPU inside the NPPU::SSingleSPUStat block located at the
//frame stats destination address, using a getllar / putllc retry loop.
//Compiles to an empty function unless SUPP_SPU_FRAME_STATS is defined.
__attribute__((always_inline))
inline void __spu_transfer_frame_stats(const unsigned int cTickCount)
{
#if defined(SUPP_SPU_FRAME_STATS)
	//destination address: jobs read it from G_FRAME_STATS_DEST_AREA, other SPU code uses the driver global
	#if defined(_SPU_JOB)
		const unsigned int cDestAreaEA = *(unsigned int*)G_FRAME_STATS_DEST_AREA;
		SPUSyncAtomicDCache();
	#else
		const unsigned int cDestAreaEA = NSPU::NDriver::g_DestStatsAreaEA;
	#endif
	int status;
	//update single spu stats
	NPPU::SSingleSPUStat curStats _ALIGN(128);
	const unsigned int cSPUID = __spu_get_current_id();
	//set up local store address and effective address once; the *_again() commands below reuse them
	mfc_prep((void*)&curStats, cDestAreaEA);
	do 
	{
		//get lock for queue
		mfc_getllar_again();
		mfc_read_atomic_status();
//		const unsigned int cCurStatIndex	= (cSPUID >= curStats.curSPUPivot)?(cSPUID - curStats.curSPUPivot) : (cMaxSPUs - curStats.curSPUPivot + cSPUID);
		const unsigned int cCurStatIndex	= cSPUID;
		curStats.count[cCurStatIndex] += cTickCount;
		//try to store the modified block back; a non-zero status restarts the loop
		mfc_putllc_again();
		status = mfc_read_atomic_status();
	}while(status != 0);
#endif
}

//threading API support funcs
namespace std
{
	//Target of the abort -> _exit remapping done at the top of this header.
	//Prints "abort()", executes the SPU stop instruction with code 255 and then spins forever,
	//so the function never returns.
	extern "C" 
	__attribute__((noreturn))
	__attribute__((always_inline))
	inline void _exit()
	{
		printf("abort()\n");
		__asm volatile ("stop 255");
		while(true){};	//spin in case execution continues after the stop
	}
}  

#endif //__SPU__
#endif //PS3
#endif //__SPU_JOB_H
