/*
	Relocatable, on-demand linkable implementation of the software floating point operations required on the SPU.
*/

#if defined(PS3)
#if defined(__SPU__)

#include "../SPUUtilities.h"

// Scalar aliases used throughout the soft-float helpers. They are plain
// preprocessor substitutions (not typedefs), and each one expands to int or
// unsigned int whatever width its name suggests.
#define int8 int
#define flag int
// NOTE(review): uint8 expands to a *signed* int; nothing in this file uses it.
#define uint8 int
#define int16 int
// Raw bit pattern of a single precision value.
#define float32 unsigned int

// Unsigned (bitsN) and signed (sbitsN) integer helper types.
typedef unsigned char bits8;
typedef signed char sbits8;
typedef unsigned short int bits16;
typedef signed short int sbits16;
typedef unsigned int bits32;
typedef signed int sbits32;
// uint64 / int64 are not declared in this file; presumably they come from
// SPUUtilities.h.
typedef uint64 bits64;
typedef int64 sbits64;

// Double precision bit pattern split into two 32 bit words. 'high' is the
// first member and holds sign, exponent and the upper fraction bits (that is
// where the extractFloat64Sign/Exp/Frac0 macros read them); 'low' holds the
// remaining fraction bits.
typedef struct
{
	unsigned int high;
	unsigned int low;
} float64;

// Overlay of the two-word float64 representation with a single 64 bit integer.
// The bits64 flavour of roundAndPackFloat64 writes b64 and returns fl64.
// NOTE(review): this relies on 'high' overlaying the most significant half of
// b64 (big-endian word order) - confirm for the target.
typedef union
{
	float64 fl64;
	bits64 b64;
}Conv64;

// Field extraction helpers.
//   extractFloat64Sign/Exp/Frac0/Frac1 take a float64 struct expression:
//     Sign  - bit 31 of .high
//     Exp   - bits 20..30 of .high (11 bits)
//     Frac0 - low 20 bits of .high
//     Frac1 - all of .low
//   extractFloat64Frac takes a 64 bit integer pattern and keeps its low 52 bits.
//   extractFloat32Sign/Exp/Frac take a 32 bit integer pattern (1/8/23 bits).
// The argument and the whole expansion are parenthesised so that arguments such
// as *ptr and uses inside larger expressions parse as intended.
#define extractFloat64Sign(a)  ((flag)((a).high>>31))
#define extractFloat64Frac1(a) ((bits32)((a).low))
#define extractFloat64Frac0(a) ((bits32)((a).high & 0x000FFFFF))
#define extractFloat64Exp(a)   ((int16)(((a).high>>20) & 0x7FF))
#define extractFloat64Frac(a)  ((bits64)((a) & 0x000FFFFFFFFFFFFFULL))
#define extractFloat32Frac(a)  ((bits32)((a) & 0x007FFFFF))
#define extractFloat32Exp(a)   ((int16)(((a)>>23) & 0xFF))
#define extractFloat32Sign(a)  ((flag)((a)>>31))
// Normalises a non-zero subnormal single precision significand: shifts aSig
// left until its leading one sits at bit 23, stores the result in *zSigPtr and
// the matching exponent (1 - shift) in *zExpPtr.
// aSig is read exactly once and the locals carry a unique prefix, so the macro
// works when zSigPtr points at aSig and regardless of the caller's own
// variable names.
#define normalizeFloat32Subnormal(aSig, zExpPtr, zSigPtr)({\
	const bits32 nfs32_sig = (aSig);\
	const int8 nfs32_shift = countLeadingZeros32(nfs32_sig) - 8;\
	*(zSigPtr) = nfs32_sig<<nfs32_shift;\
	*(zExpPtr) = 1 - nfs32_shift;})

// Unsigned comparison of two 64 bit values given as (high, low) word pairs.
// le64 yields 1 when a0:a1 <= b0:b1, lt64 when a0:a1 < b0:b1, otherwise 0.
#define le64(a0, a1, b0, b1) ((flag)(((a0) == (b0)) ? ((a1) <= (b1)) : ((a0) < (b0))))
#define lt64(a0, a1, b0, b1) ((flag)(((a0) == (b0)) ? ((a1) <  (b1)) : ((a0) < (b0))))

// Shifts the 64 bit value a0:a1 left by 'count' bits (count must be 0..31) and
// stores the high word in *z0Ptr and the low word in *z1Ptr.
// All inputs are captured before any output is written: the macro is routinely
// invoked in place (z0Ptr == &a0, z1Ptr == &a1), and reading a1 after *z1Ptr
// had been stored used to feed the already shifted low word into the high word.
#define shortShift64Left(a0, a1, count, z0Ptr, z1Ptr)({\
	const bits32 ssl64_a0 = (a0);\
	const bits32 ssl64_a1 = (a1);\
	const int ssl64_count = (count);\
	*(z1Ptr) = ssl64_a1<<ssl64_count;\
	*(z0Ptr) = (ssl64_count == 0) ? ssl64_a0 : ((ssl64_a0<<ssl64_count) | (ssl64_a1>>((-ssl64_count) & 31)));})
// Shifts the 64 bit value a0:a1 right by 'count' bits (any count >= 0; bits
// shifted off are lost, counts of 64 or more give zero) and stores the high
// word in *z0Ptr and the low word in *z1Ptr.
// Inputs are snapshotted and the locals carry a unique prefix; the previous
// locals were called z0/z1 and silently shadowed caller variables of the same
// name that were passed as arguments.
#define shift64Right(a0, a1, count, z0Ptr, z1Ptr)({\
	const bits32 s64r_a0 = (a0);\
	const bits32 s64r_a1 = (a1);\
	const int8 s64r_count = (count);\
	const int8 s64r_negCount = (-s64r_count) & 31;\
	bits32 s64r_z0, s64r_z1;\
	if(s64r_count == 0)\
	{\
		s64r_z1 = s64r_a1;\
		s64r_z0 = s64r_a0;\
	}\
	else \
	if(s64r_count < 32)\
	{\
		s64r_z1 = (s64r_a0<<s64r_negCount) | (s64r_a1>>s64r_count);\
		s64r_z0 = s64r_a0>>s64r_count;\
	}\
	else\
	{\
		s64r_z1 = (s64r_count < 64) ? (s64r_a0>>(s64r_count & 31)) : 0;\
		s64r_z0 = 0;\
	}\
	*(z1Ptr) = s64r_z1;\
	*(z0Ptr) = s64r_z0;})

// 96 bit subtraction: (a0:a1:a2) - (b0:b1:b2), modulo 2^96. The three result
// words are stored in *z0Ptr (most significant), *z1Ptr and *z2Ptr.
// Inputs are snapshotted and locals are prefixed so that in-place use and
// caller variables named z0/z1/z2 are safe.
#define sub96(a0,a1,a2,b0,b1,b2,z0Ptr,z1Ptr,z2Ptr)({\
	const bits32 sub96_a0 = (a0), sub96_a1 = (a1), sub96_a2 = (a2);\
	const bits32 sub96_b0 = (b0), sub96_b1 = (b1), sub96_b2 = (b2);\
	bits32 sub96_z0, sub96_z1, sub96_z2;\
	int8 sub96_borrow0, sub96_borrow1;\
	sub96_z2 = sub96_a2 - sub96_b2;\
	sub96_borrow1 = ( sub96_a2 < sub96_b2 );\
	sub96_z1 = sub96_a1 - sub96_b1;\
	sub96_borrow0 = ( sub96_a1 < sub96_b1 );\
	sub96_z0 = sub96_a0 - sub96_b0;\
	sub96_z0 -= ( sub96_z1 < (bits32)sub96_borrow1 );\
	sub96_z1 -= sub96_borrow1;\
	sub96_z0 -= sub96_borrow0;\
	*(z2Ptr) = sub96_z2;\
	*(z1Ptr) = sub96_z1;\
	*(z0Ptr) = sub96_z0;})

// 64 bit addition: (a0:a1) + (b0:b1), modulo 2^64. The high word of the sum is
// stored in *z0Ptr and the low word in *z1Ptr.
// The low sum is computed once from captured inputs. The previous expansion
// re-read a1 after *z1Ptr had been written, so in-place calls such as
// add64(x0, x1, 0, 1, &x0, &x1) derived the carry from the wrong value.
#define add64(a0, a1, b0, b1, z0Ptr, z1Ptr)({\
	const bits32 add64_a0 = (a0);\
	const bits32 add64_a1 = (a1);\
	const bits32 add64_b0 = (b0);\
	const bits32 add64_b1 = (b1);\
	const bits32 add64_low = add64_a1 + add64_b1;\
	*(z1Ptr) = add64_low;\
	*(z0Ptr) = add64_a0 + add64_b0 + (add64_low < add64_a1);})

// 64 bit subtraction: (a0:a1) - (b0:b1), modulo 2^64. The high word of the
// difference is stored in *z0Ptr and the low word in *z1Ptr.
// Inputs are captured first so the borrow is always derived from the original
// low words, including when the outputs alias the inputs.
#define sub64(a0, a1, b0, b1, z0Ptr, z1Ptr)({\
	const bits32 sub64_a0 = (a0);\
	const bits32 sub64_a1 = (a1);\
	const bits32 sub64_b0 = (b0);\
	const bits32 sub64_b1 = (b1);\
	*(z1Ptr) = sub64_a1 - sub64_b1;\
	*(z0Ptr) = sub64_a0 - sub64_b0 - (sub64_a1 < sub64_b1);})

// Builds a float64 from its fields: sign in bit 31 of .high, exponent starting
// at bit 20 of .high, zSig0 added into .high (so a carry out of the fraction
// bumps the exponent) and zSig1 as .low.
// float64 declares 'high' first, so the combined sign/exponent/zSig0 word must
// be the first initialiser; with the old order it ended up in .low, which is
// not where extractFloat64Sign/Exp/Frac0 look for it.
#define packFloat64(zSign, zExp, zSig0, zSig1)\
	((float64){(((bits32)(zSign))<<31)+(((bits32)(zExp))<<20)+(zSig0), (zSig1)})

// Assembles a 64 bit double pattern: sign at bit 63, exponent starting at bit
// 52, significand added on top (addition, not OR, so a significand that
// overflows its field carries into the exponent).
#define packFloat64From64(zSign, zExp, zSig) \
	((zSig) + (((bits64)(zExp))<<52) + (((bits64)(zSign))<<63))

// Normalises a non-zero subnormal double significand aSig0:aSig1 (high:low):
// shifts it left until the leading one sits at bit 20 of the high word, stores
// the words in *zSig0Ptr / *zSig1Ptr and the matching exponent in *zExpPtr.
// The inputs are snapshotted before anything is written so that the usual
// in-place invocation (zSig0Ptr == &aSig0, zSig1Ptr == &aSig1) cannot feed an
// already overwritten word into shortShift64Left; locals are prefixed to avoid
// capturing caller names.
#define normalizeFloat64Subnormal(aSig0,aSig1,zExpPtr,zSig0Ptr,zSig1Ptr)({\
	const bits32 nfs64_sig0 = (aSig0);\
	const bits32 nfs64_sig1 = (aSig1);\
	int8 nfs64_shift;\
	if(nfs64_sig0 == 0)\
	{\
		nfs64_shift = countLeadingZeros32(nfs64_sig1) - 11;\
		if(nfs64_shift < 0)\
		{\
			*(zSig0Ptr) = nfs64_sig1>>(-nfs64_shift);\
			*(zSig1Ptr) = nfs64_sig1<<(nfs64_shift & 31);\
		}\
		else\
		{\
			*(zSig0Ptr) = nfs64_sig1<<nfs64_shift;\
			*(zSig1Ptr) = 0;\
		}\
		*(zExpPtr) = -nfs64_shift - 31;\
	}\
	else\
	{\
		nfs64_shift = countLeadingZeros32(nfs64_sig0) - 11;\
		shortShift64Left(nfs64_sig0, nfs64_sig1, nfs64_shift, zSig0Ptr, zSig1Ptr);\
		*(zExpPtr) = 1 - nfs64_shift;\
	}})

// 32 x 32 -> 64 bit unsigned multiply built from four 16 x 16 partial products.
// The high word of the product is stored in *z0Ptr and the low word in *z1Ptr.
// Locals carry a unique prefix: they used to be called z0/z1, so callers passing
// &z0 or &z1 (as mul64To128 and mul64By32To96 do) had the result written to the
// macro's own temporaries instead of their variables.
#define mul32To64(a, b, z0Ptr, z1Ptr)({\
	const bits32 m32_a = (a);\
	const bits32 m32_b = (b);\
	const bits16 m32_aLow = (bits16)m32_a;\
	const bits16 m32_aHigh = (bits16)(m32_a>>16);\
	const bits16 m32_bLow = (bits16)m32_b;\
	const bits16 m32_bHigh = (bits16)(m32_b>>16);\
	bits32 m32_low = ( (bits32) m32_aLow ) * m32_bLow;\
	bits32 m32_midA = ( (bits32) m32_aLow ) * m32_bHigh;\
	const bits32 m32_midB = ( (bits32) m32_aHigh ) * m32_bLow;\
	bits32 m32_high = ( (bits32) m32_aHigh ) * m32_bHigh;\
	m32_midA += m32_midB;\
	m32_high += ( ( (bits32) ( m32_midA < m32_midB ) )<<16 ) + ( m32_midA>>16 );\
	m32_midA <<= 16;\
	m32_low += m32_midA;\
	m32_high += ( m32_low < m32_midA );\
	*(z1Ptr) = m32_low;\
	*(z0Ptr) = m32_high;})

// 64 x 64 -> 128 bit unsigned multiply of a0:a1 by b0:b1. The product words are
// stored in *z0Ptr (most significant) through *z3Ptr (least significant).
// Every intermediate lives in its own uniquely named temporary: no add64 call
// has an output aliasing one of its inputs and no temporary shares a name with
// the locals of mul32To64, so the result does not depend on how those helper
// macros treat aliasing or local names.
#define mul64To128(a0,a1,b0,b1,z0Ptr,z1Ptr,z2Ptr,z3Ptr)({\
		const bits32 m128_a0 = (a0), m128_a1 = (a1);\
		const bits32 m128_b0 = (b0), m128_b1 = (b1);\
		bits32 m128_llHi, m128_llLo;\
		bits32 m128_lhHi, m128_lhLo;\
		bits32 m128_hhHi, m128_hhLo;\
		bits32 m128_hlHi, m128_hlLo;\
		bits32 m128_w1a, m128_w2a;\
		bits32 m128_w0a, m128_w1b;\
		bits32 m128_carry, m128_w2b;\
		bits32 m128_w0b, m128_w1c;\
		mul32To64( m128_a1, m128_b1, &m128_llHi, &m128_llLo );\
		mul32To64( m128_a1, m128_b0, &m128_lhHi, &m128_lhLo );\
		add64( m128_lhHi, m128_lhLo, 0, m128_llHi, &m128_w1a, &m128_w2a );\
		mul32To64( m128_a0, m128_b0, &m128_hhHi, &m128_hhLo );\
		add64( m128_hhHi, m128_hhLo, 0, m128_w1a, &m128_w0a, &m128_w1b );\
		mul32To64( m128_a0, m128_b1, &m128_hlHi, &m128_hlLo );\
		add64( m128_hlHi, m128_hlLo, 0, m128_w2a, &m128_carry, &m128_w2b );\
		add64( m128_w0a, m128_w1b, 0, m128_carry, &m128_w0b, &m128_w1c );\
		*(z3Ptr) = m128_llLo;\
		*(z2Ptr) = m128_w2b;\
		*(z1Ptr) = m128_w1c;\
		*(z0Ptr) = m128_w0b;})

// 96 bit addition: (a0:a1:a2) + (b0:b1:b2), modulo 2^96. The three result
// words are stored in *z0Ptr (most significant), *z1Ptr and *z2Ptr.
// Inputs are snapshotted and locals are prefixed so that in-place use and
// caller variables named z0/z1/z2 are safe.
#define add96(a0,a1,a2,b0,b1,b2,z0Ptr,z1Ptr,z2Ptr)({\
	const bits32 add96_a0 = (a0), add96_a1 = (a1), add96_a2 = (a2);\
	const bits32 add96_b0 = (b0), add96_b1 = (b1), add96_b2 = (b2);\
	bits32 add96_z0, add96_z1, add96_z2;\
	int8 add96_carry0, add96_carry1;\
	add96_z2 = add96_a2 + add96_b2;\
	add96_carry1 = ( add96_z2 < add96_a2 );\
	add96_z1 = add96_a1 + add96_b1;\
	add96_carry0 = ( add96_z1 < add96_a1 );\
	add96_z0 = add96_a0 + add96_b0;\
	add96_z1 += add96_carry1;\
	add96_z0 += ( add96_z1 < (bits32)add96_carry1 );\
	add96_z0 += add96_carry0;\
	*(z2Ptr) = add96_z2;\
	*(z1Ptr) = add96_z1;\
	*(z0Ptr) = add96_z0;})

// 64 x 32 -> 96 bit unsigned multiply of a0:a1 by b. The product words are
// stored in *z0Ptr (most significant), *z1Ptr and *z2Ptr.
// Uses uniquely named temporaries and a non-aliasing add64 call, so the result
// does not depend on how the helper macros treat aliasing or local names.
#define mul64By32To96(a0,a1,b,z0Ptr,z1Ptr,z2Ptr)({\
	const bits32 m96_a0 = (a0), m96_a1 = (a1), m96_b = (b);\
	bits32 m96_lowHi, m96_lowLo;\
	bits32 m96_highHi, m96_highLo;\
	bits32 m96_out0, m96_out1;\
	mul32To64(m96_a1, m96_b, &m96_lowHi, &m96_lowLo);\
	mul32To64(m96_a0, m96_b, &m96_highHi, &m96_highLo);\
	add64(m96_highHi, m96_highLo, 0, m96_lowHi, &m96_out0, &m96_out1);\
	*(z2Ptr) = m96_lowLo;\
	*(z1Ptr) = m96_out1;\
	*(z0Ptr) = m96_out0;})

// Returns an estimate of the 32 bit quotient (a0:a1) / b.
// When b <= a0 the quotient does not fit in 32 bits and 0xFFFFFFFF is returned.
// The upper 16 bits of b (b>>16) are used as a trial divisor, so b must be at
// least 0x10000 to avoid a division by zero.
// NOTE(review): this follows SoftFloat's estimateDiv64To32, which documents
// b >= 2^31 as a precondition and an estimate within [q, q+2] - confirm callers.
static inline bits32 estimateDiv64To32(bits32 a0, bits32 a1, bits32 b)
{
	bits32 b0, b1;
	bits32 rem0, rem1, term0, term1;
	bits32 z;

	IF(b <= a0,0)
		return 0xFFFFFFFF;
	b0 = b>>16;
	// First guess for the upper 16 quotient bits.
	z = ( b0<<16 <= a0 ) ? 0xFFFF0000 : ( a0 / b0 )<<16;
	mul32To64( b, z, &term0, &term1 );
	sub64(a0, a1, term0, term1, &rem0, &rem1);
	// b * 2^16 as a 64 bit value is b0:b1; loop invariant, so computed once.
	b1 = b<<16;
	// The guess may be too large: step it down until the remainder is
	// non-negative again.
	while(((sbits32)rem0) < 0)
	{
		// Sum into fresh temporaries; add64 must not be handed outputs that
		// alias the inputs it still has to read for the carry.
		bits32 sum0, sum1;
		z -= 0x10000;
		add64( rem0, rem1, b0, b1, &sum0, &sum1 );
		rem0 = sum0;
		rem1 = sum1;
	}
	// Lower 16 quotient bits from the remaining dividend.
	rem0 = ( rem0<<16 ) | ( rem1>>16 );
	z |= ( b0<<16 <= rem0 ) ? 0xFFFF : rem0 / b0;
	return z;
}

// Shifts the 96 bit value a0:a1:a2 right by 'count' bits, keeping 64 result
// bits in *z0Ptr:*z1Ptr and an "extra" word in *z2Ptr. The extra word receives
// the bits shifted just below the 64 bit result; any non-zero bits that fall
// off beyond it (including a non-zero a2) are jammed into its least
// significant bit. A count of 0 passes all three words through unchanged.
ILINE void shift64ExtraRightJamming(bits32 a0,bits32 a1,bits32 a2,int16 count,bits32 *z0Ptr,bits32 *z1Ptr,bits32 *z2Ptr)
{
	const int8 wrap = (-count) & 31;
	bits32 hi = a0;
	bits32 mid = a1;
	bits32 extra = a2;
	if(count != 0)
	{
		// Everything that ends up below the extra word only matters as
		// "was it non-zero".
		bits32 sticky = a2;
		if(count < 32)
		{
			extra = a1<<wrap;
			mid = (a0<<wrap) | (a1>>count);
			hi = a0>>count;
		}
		else
		{
			hi = 0;
			if(count == 32)
			{
				extra = a1;
				mid = a0;
			}
			else
			{
				sticky |= a1;
				if(count < 64)
				{
					extra = a0<<wrap;
					mid = a0>>(count & 31);
				}
				else
				{
					extra = (count == 64) ? a0 : (a0 != 0);
					mid = 0;
				}
			}
		}
		extra |= (sticky != 0);
	}
	*z2Ptr = extra;
	*z1Ptr = mid;
	*z0Ptr = hi;
}

// Shifts the 64 bit value 'a' right by 'count' bits and stores the result in
// *zPtr. If any non-zero bit is shifted off, the least significant bit of the
// result is set ("jamming"). Counts of 64 or more yield 1 for a non-zero input
// and 0 otherwise.
ILINE void shift64RightJamming64(bits64 a, int16 count, bits64 *zPtr)
{
	IF(count == 0,0)
	{
		*zPtr = a;
		return;
	}
	if(count >= 64)
	{
		*zPtr = (a != 0);
		return;
	}
	// Bits that the right shift below is about to discard.
	const bits64 discarded = a<<((-count) & 63);
	*zPtr = (a>>count) | (discarded != 0);
}

// Shifts the 64 bit value a0:a1 right by 'count' bits and stores the high word
// in *z0Ptr and the low word in *z1Ptr. If any non-zero bit is shifted off, the
// least significant bit of the low word is set ("jamming"). Counts of 64 or
// more yield 0:1 for a non-zero input and 0:0 otherwise.
ILINE void shift64RightJamming(bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr)
{
	const int8 wrap = (-count) & 31;
	bits32 hi = 0;
	bits32 lo;
	if(count == 0)
	{
		hi = a0;
		lo = a1;
	}
	else if(count < 32)
	{
		hi = a0>>count;
		lo = (a0<<wrap) | (a1>>count) | ((a1<<wrap) != 0);
	}
	else if(count == 32)
		lo = a0 | (a1 != 0);
	else if(count < 64)
		lo = (a0>>(count & 31)) | (((a0<<wrap) | a1) != 0);
	else
		lo = ((a0 | a1) != 0);
	*z1Ptr = lo;
	*z0Ptr = hi;
}

// Leading-zero count of a 32 bit value via SPU intrinsics: 'a' is promoted into
// element 0 of a vector, spu_cntlz is applied and element 0 of the result is
// extracted. The macros above that reference it are only expanded at their use
// sites, so defining it here is sufficient.
// NOTE(review): presumably yields 32 for a == 0 (countLeadingZeros64 tests for
// that value) - confirm against the SPU intrinsics reference.
#define countLeadingZeros32(a) spu_extract(spu_cntlz(spu_promote(a, 0)), 0)

// Leading-zero count of a 64 bit value, assembled from two 32 bit counts: when
// the upper count is 32 (upper word all zero) the lower count is added to it,
// otherwise the upper count alone is the answer.
static ILINE int8 countLeadingZeros64(uint64 a)
{
	// Count for word 0 of the promoted value after a 4 byte quadword rotate;
	// presumably this brings the low 32 bits of 'a' into word 0 - confirm
	// against the si_rotqbyi documentation.
	const uint32 cZerosLower = 	spu_extract(spu_cntlz((vec_uint4)si_rotqbyi((qword)spu_promote(a,0),4)), 0);
	// Count for word 0 of the promoted value as is (presumably the high 32 bits).
	const uint32 cZerosUpper = 	spu_extract(spu_cntlz((vec_uint4)spu_promote(a, 0)), 0);
	return (cZerosUpper==32)?(cZerosUpper+cZerosLower):cZerosUpper;
}

// Assembles a single precision bit pattern: sign at bit 31, exponent starting
// at bit 23, significand added on top (addition, so a significand that
// overflows its field carries into the exponent).
// The whole sum is parenthesised and cast; previously the cast covered only the
// sign term and the expansion fell apart when used inside a larger expression.
#define packFloat32(zSign, zExp, zSig) ((float32)((((bits32)(zSign))<<31)+(((bits32)(zExp))<<23)+(zSig)))

// Rounds the significand zSig0:zSig1 using the extra word zSig2 (round to
// nearest, ties to even) and packs it with zSign and zExp into a float64.
// zSig2 holds the bits below the significand: its top bit means "at least
// half an ulp"; if the remaining bits are all zero the value is an exact tie.
// No overflow, underflow or exception handling is performed here.
static ILINE float64 roundAndPackFloat64(flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2)
{
	if((sbits32) zSig2 < 0)
	{
		// Increment the 64 bit significand by one. Written out explicitly rather
		// than through an in-place add64(), whose carry is not reliable when its
		// outputs alias its inputs.
		const bits32 incrementedLow = zSig1 + 1;
		zSig0 += (incrementedLow == 0);
		zSig1 = incrementedLow;
		// Exact tie (only the top bit of zSig2 set): force the result even.
		if((bits32)(zSig2 + zSig2) == 0)
			zSig1 &= ~(bits32)1;
	}
	else
	if((zSig0 | zSig1) == 0)
		zExp = 0;	// a zero significand packs as zero
	return packFloat64( zSign, zExp, zSig0, zSig1 );
}

// Rounds the 64 bit significand zSig (round to nearest, ties to even), whose low
// 10 bits are discarded by the rounding, and packs the result with zSign and
// zExp. The packed 64 bit pattern is returned through the Conv64 overlay.
// No overflow, underflow or exception handling is performed here.
static ILINE float64 roundAndPackFloat64(flag zSign, int16 zExp, bits64 zSig)
{
	const int16 halfUlp = 0x200;
	const int16 discardedBits = zSig & 0x3FF;
	Conv64 packed;

	zSig = (zSig + halfUlp)>>10;
	// Exactly half way: clear the lowest bit so the result is even.
	if(discardedBits == halfUlp)
		zSig &= ~(bits64)1;
	if(zSig == 0)
		zExp = 0;
	packed.b64 = packFloat64From64(zSign, zExp, zSig);
	return packed.fl64;
}

// Rounds the significand zSig (round to nearest, ties to even), whose low 7
// bits are discarded by the rounding, and packs the result with zSign and zExp
// into a single precision bit pattern.
// No overflow, underflow or exception handling is performed here.
static ILINE float32 roundAndPackFloat32(flag zSign, int16 zExp, bits32 zSig)
{
	const int8 halfUlp = 0x40;
	const int8 discardedBits = zSig & 0x7F;

	zSig = (zSig + halfUlp)>>7;
	// Exactly half way: clear the lowest bit so the result is even.
	if(discardedBits == halfUlp)
		zSig &= ~(bits32)1;
	if(zSig == 0)
		zExp = 0;
	return packFloat32( zSign, zExp, zSig );
}

// Normalises the 64 bit significand zSig so that its leading one sits at bit
// 62, adjusts the exponent by the same amount, then rounds and packs through
// the bits64 flavour of roundAndPackFloat64.
ILINE float64 normalizeRoundAndPackFloat64(flag zSign, int16 zExp, bits64 zSig)
{
	const int8 leftShift = countLeadingZeros64(zSig) - 1;
	const int16 adjustedExp = zExp - leftShift;
	const bits64 normalizedSig = zSig<<leftShift;
	return roundAndPackFloat64(zSign, adjustedExp, normalizedSig);
}

// Normalises the significand zSig0:zSig1 so that its leading one sits at bit 20
// of the high word, adjusts the exponent accordingly, then rounds and packs
// through the three-word flavour of roundAndPackFloat64. Bits pushed out by a
// right shift are collected in the extra word used for rounding.
ILINE float64 normalizeRoundAndPackFloat64(flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1)
{
	int8 shiftCount;
	bits32 zSig2;
	if ( zSig0 == 0 )
	{
		// High word empty: move the low word up by a whole word first.
		zSig0 = zSig1;
		zSig1 = 0;
		zExp -= 32;
	}
	shiftCount = countLeadingZeros32(zSig0) - 11;
	if(0 <= shiftCount)
	{
		// Shift from copies: shortShift64Left is a macro and must not be given
		// outputs that alias the operands it still has to read.
		const bits32 inSig0 = zSig0;
		const bits32 inSig1 = zSig1;
		zSig2 = 0;
		shortShift64Left( inSig0, inSig1, shiftCount, &zSig0, &zSig1 );
	}
	else
		shift64ExtraRightJamming(zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2);
	zExp -= shiftCount;
	return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
}

// Subtracts the magnitude of b from the magnitude of a and returns the result
// with sign zSign, or with the opposite sign when b's magnitude is the larger
// one. Equal magnitudes give +0.
// An infinite operand with the larger exponent is passed through (a as is, b
// as an infinity with the flipped sign). NaNs are not handled specially, and
// inf - inf is not detected.
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
{
	int16 aExp, bExp, zExp;
	bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
	int16 expDiff;

	// Raw fraction words, kept apart from the shifted significands so that
	// shortShift64Left (a macro) never reads a word it has already overwritten.
	const bits32 aFrac1 = extractFloat64Frac1( a );
	const bits32 aFrac0 = extractFloat64Frac0( a );
	const bits32 bFrac1 = extractFloat64Frac1( b );
	const bits32 bFrac0 = extractFloat64Frac0( b );

	aExp = extractFloat64Exp( a );
	bExp = extractFloat64Exp( b );
	expDiff = aExp - bExp;
	// Make room for 10 guard bits below the fraction.
	shortShift64Left(aFrac0, aFrac1, 10, &aSig0, &aSig1);
	shortShift64Left(bFrac0, bFrac1, 10, &bSig0, &bSig1);
	if(0 < expDiff)
		goto aExpBigger;
	if(expDiff < 0)
		goto bExpBigger;
	// Equal exponents: two subnormals are treated as having exponent 1.
	if(aExp == 0)
	{
		aExp = 1;
		bExp = 1;
	}
	if(bSig0 < aSig0)
		goto aBigger;
	if(aSig0 < bSig0)
		goto bBigger;
	if(bSig1 < aSig1)
		goto aBigger;
	if(aSig1 < bSig1)
		goto bBigger;
	return packFloat64(false, 0, 0, 0);
bExpBigger:
	// b infinite, a finite: the result is an infinity with the flipped sign
	// (mirrors the aExp == 0x7FF shortcut under aExpBigger).
	IF(bExp == 0x7FF,0)
		return packFloat64(zSign ^ 1, 0x7FF, 0, 0);
	if ( aExp == 0 )
		++expDiff;
	else
		aSig0 |= 0x40000000;	// implicit leading one of a
	shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
	bSig0 |= 0x40000000;		// implicit leading one of b
bBigger:
	sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
	zExp = bExp;
	zSign ^= 1;
	goto normalizeRoundAndPack;
aExpBigger:
	IF(aExp == 0x7FF,0)
		return a;
	if (bExp == 0)
		--expDiff;
	else
		bSig0 |= 0x40000000;	// implicit leading one of b
	shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
	aSig0 |= 0x40000000;		// implicit leading one of a
aBigger:
	sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
	zExp = aExp;
normalizeRoundAndPack:
	--zExp;
	// The extra -10 undoes the guard-bit shift applied above.
	return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
}

// Adds the magnitudes of a and b and returns the sum with sign zSign.
// The operand with the smaller exponent is shifted right to line up with the
// other one; the bits shifted out are collected in zSig2 and used for rounding.
// An exponent of 0x7FF on the larger (or equal) operand short-circuits: a is
// returned as is, or an infinity with sign zSign is built when b is the larger.
// NaNs are not handled specially.
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
{
	int16 aExp, bExp, zExp;
	bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
	int16 expDiff;

	aSig1 = extractFloat64Frac1( a );
	aSig0 = extractFloat64Frac0( a );
	aExp = extractFloat64Exp( a );
	bSig1 = extractFloat64Frac1( b );
	bSig0 = extractFloat64Frac0( b );
	bExp = extractFloat64Exp( b );
	expDiff = aExp - bExp;
	if(0 < expDiff) 
	{
		// a has the larger exponent: align b to it.
		IF(aExp == 0x7FF,0)
			return a;
		if (bExp == 0)
			--expDiff;
		else
			// b is normal: make its implicit leading one explicit.
			bSig0 |= 0x00100000;
		shift64ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
		zExp = aExp;
	}
	else 
	if(expDiff < 0)
	{
		// b has the larger exponent: align a to it.
		IF(bExp == 0x7FF,0)
			return packFloat64(zSign, 0x7FF, 0, 0);
		if (aExp == 0)
			++expDiff;
		else 
			// a is normal: make its implicit leading one explicit.
			aSig0 |= 0x00100000;
		shift64ExtraRightJamming(aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
		zExp = bExp;
	}
	else 
	{
		// Equal exponents: the fractions can be added directly.
		IF(aExp == 0x7FF,0)
			return a;
		add64(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
		// Both subnormal: the fraction sum is packed with exponent 0 as is.
		if (aExp == 0)
			return packFloat64(zSign, 0, zSig0, zSig1);
		zSig2 = 0;
		// Sum of the two implicit leading ones (2 * 0x00100000).
		zSig0 |= 0x00200000;
		zExp = aExp;
		goto shiftRight1;
	}
	// Shared tail of the two unequal-exponent cases: one leading one is added
	// here through aSig0 before the significands are summed.
	aSig0 |= 0x00100000;
	add64(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
	--zExp;
	// No carry into bit 21: the sum is packed with the decremented exponent.
	if(zSig0 < 0x00200000)
		goto roundAndPack;
	++zExp;
shiftRight1:
	// Carry out: shift the sum right by one, feeding the lost bit into zSig2.
	shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
roundAndPack:
	return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
}

#endif //__SPU__
#endif //PS3
