#include "StdAfx.h"
#include "SimpleDXTCompressor.h"







// source code extracted from
// http://developer.nvidia.com/object/texture_tools.html
//
//
// The NVIDIA Texture Tools 2 are licensed under the MIT license.
// 
// Copyright (c) 2007 NVIDIA Corporation
// 
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
// 
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.


// Type definitions
// Integer aliases named after the bit width each one is expected to have.
typedef unsigned char		uint8;
typedef signed char			int8;

typedef unsigned short		uint16;
typedef signed short		int16;

typedef unsigned int		uint32;
typedef signed int			int32;

// NOTE(review): __int64 is a compiler-specific keyword, not standard C++.
typedef unsigned __int64	uint64;
typedef signed __int64		int64;

// Aliases
// Shorthand for the 32 bit unsigned type; used for counters, indices and
// error sums throughout this file.
typedef uint32				uint;







/// 32 bit color with four 8 bit channels (stored as BGRA when
/// NV_LITTLE_ENDIAN is set). The channels alias one packed 32 bit word.
class Color32
{
public:
	/// Default construction leaves all channels unset.
	Color32() { }
	/// Copy by duplicating the packed word.
	Color32(const Color32 & other) : u(other.u) { }
	/// Opaque color (alpha forced to 0xFF) from red, green and blue.
	Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); }
	/// Color from red, green, blue and alpha.
	Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA(R, G, B, A); }
	//Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); }
	//Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); }
	//Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); }
	/// Adopt an already packed 32 bit value without reordering anything.
	explicit Color32(uint32 U) : u(U) { }

	/// Store all four channels; arguments are given in R, G, B, A order.
	void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A)
	{
		a = A;
		b = B;
		g = G;
		r = R;
	}

	/// Store all four channels; arguments are given in B, G, R, A order.
	void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF)
	{
		setRGBA(R, G, B, A);
	}

	/// Implicit view of the color as its packed 32 bit word.
	operator uint32 () const {
		return u;
	}

	// The channel order inside the packed word is selected at compile time.
	// NOTE(review): NV_LITTLE_ENDIAN is not defined in this file; if no
	// included header defines it, the #else layout is the one compiled.
	union {
		struct {
#if NV_LITTLE_ENDIAN
			uint8 b, g, r, a;
#else
			uint8 a: 8;
			uint8 r: 8;
			uint8 g: 8;
			uint8 b: 8;
#endif
		};
		uint32 u;
	};
};



/// 16 bit color with 5 bits of red, 6 of green and 5 of blue, also
/// accessible as one packed 16 bit word.
class Color16
{
public:
	// Field order inside the packed word is chosen by NV_LITTLE_ENDIAN.
	union {
		struct {
#if NV_LITTLE_ENDIAN
			uint16 b : 5;
			uint16 g : 6;
			uint16 r : 5;
#else
			uint16 r : 5;
			uint16 g : 6;
			uint16 b : 5;
#endif
		};
		uint16 u;
	};

	/// Default construction leaves the color unset.
	Color16() { }
	/// Copy by duplicating the packed word.
	Color16(const Color16 & other) : u(other.u) { }
	/// Adopt an already packed 16 bit value.
	explicit Color16(uint16 U) : u(U) { }
};

/// Reduce an 8:8:8 color to 5:6:5 by discarding the low bits of each
/// channel (truncation, no rounding). Alpha is ignored.
inline Color16 toColor16(Color32 c)
{
	// Target layout:            rrrrrggggggbbbbb
	// Source layout:    rrrrr000gggggg00bbbbb000
	// A packed-word variant would be:
	//	color.u = (c.u >> 3) & 0x1F;
	//	color.u |= (c.u >> 5) & 0x7E0;
	//	color.u |= (c.u >> 8) & 0xF800;

	Color16 packed;
	packed.b = c.b >> 3;	// keep top 5 bits
	packed.g = c.g >> 2;	// keep top 6 bits
	packed.r = c.r >> 3;	// keep top 5 bits
	return packed;
}


// Color type used by the distance/palette helpers below (loadColor,
// paletteError, computeIndices, blockError); here it is simply Color32.
typedef Color32 VectorColor;


/// A block of 16 colors (4x4 texels), the unit the compressors below work on.
struct ColorBlock
{
	// NOTE(review): no definition of the default or copy constructor is
	// visible in this file; only the linear-image constructor is defined.
	ColorBlock();
	/// Build the block from 16 packed 32 bit colors.
	ColorBlock(const uint * linearImage);
	ColorBlock(const ColorBlock & block);
	//	ColorBlock(const Image * img, uint x, uint y);

	//	void init(const Image * img, uint x, uint y);

//	void swizzleDXT5n();
//	void splatX();
//	void splatY();

//	uint countUniqueColors() const;
//	Color32 averageColor() const;

//	void diameterRange(Color32 * start, Color32 * end) const;
//	void luminanceRange(Color32 * start, Color32 * end) const;
	/// Per-channel RGB bounding box of the block, inset by 1/16 of its
	/// extent; the minimum goes to *start and the maximum to *end.
	void boundsRange(Color32 * start, Color32 * end) const;
//	void boundsRangeAlpha(Color32 * start, Color32 * end) const;
//	void bestFitRange(Color32 * start, Color32 * end) const;

//	void sortColorsByAbsoluteValue();

	//	void computeRange(const Vector3 & axis, Color32 * start, Color32 * end) const;
	//	void sortColors(const Vector3 & axis);

	//	Line3 bestFitLine() const;
//	float volume() const;
	//	Line3 diameterLine() const;

	// Accessors
	// NOTE(review): colors() and the (x, y) overloads of color() are declared
	// here but not defined in the visible part of this file.
	const Color32 * colors() const;

	/// Texel i of the block, by value / by reference. No bounds checking.
	Color32 color(uint i) const;
	Color32 & color(uint i);

	Color32 color(uint x, uint y) const;
	Color32 & color(uint x, uint y);

private:

	// The 16 texels of the block.
	Color32 m_color[4*4];

};




/// Fill the block from 16 consecutive packed 32 bit colors.
/// The packed words are adopted as-is; no channel reordering takes place.
ColorBlock::ColorBlock(const uint * linearImage)
{
	for (uint texel = 0; texel < 16; ++texel) {
		m_color[texel] = Color32(linearImage[texel]);
	}
}


/// Read one texel of the block by value. The index is not range checked.
inline Color32 ColorBlock::color(uint i) const
{
	//	nvDebugCheck(i < 16);
	const Color32 & texel = m_color[i];
	return texel;
}


/// Compute the per-channel RGB bounding box of the 16 texels, pull both
/// corners inwards by 1/16 of the box extent, and return the low corner in
/// *start and the high corner in *end. Both results carry alpha 0xFF.
void ColorBlock::boundsRange(Color32 * start, Color32 * end) const
{
//	nvDebugCheck(start != NULL);
//	nvDebugCheck(end != NULL);

	int lowR = 255, lowG = 255, lowB = 255;
	int highR = 0, highG = 0, highB = 0;

	for (uint texel = 0; texel < 16; ++texel)
	{
		const Color32 & px = m_color[texel];

		if (px.r < lowR) lowR = px.r;
		if (px.r > highR) highR = px.r;

		if (px.g < lowG) lowG = px.g;
		if (px.g > highG) highG = px.g;

		if (px.b < lowB) lowB = px.b;
		if (px.b > highB) highB = px.b;
	}

	// Inset is 1/16 of each channel's extent.
	const int insetR = (highR - lowR) >> 4;
	const int insetG = (highG - lowG) >> 4;
	const int insetB = (highB - lowB) >> 4;

	// Move the low corner up, saturating at 255.
	const int startR = (lowR + insetR <= 255) ? lowR + insetR : 255;
	const int startG = (lowG + insetG <= 255) ? lowG + insetG : 255;
	const int startB = (lowB + insetB <= 255) ? lowB + insetB : 255;

	// Move the high corner down, saturating at 0.
	const int endR = (highR >= insetR) ? highR - insetR : 0;
	const int endG = (highG >= insetG) ? highG - insetG : 0;
	const int endB = (highB >= insetB) ? highB - insetB : 0;

	*start = Color32(uint8(startR), uint8(startG), uint8(startB));
	*end = Color32(uint8(endR), uint8(endG), uint8(endB));
}


/// DXT1 block.
/// Two 16 bit endpoint colors followed by 32 bits of per-texel selectors
/// (2 bits per texel, texel i at bit 2*i — see blockError/computeIndices).
struct BlockDXT1
{
	Color16 col0;
	Color16 col1;
	union {
		uint8 row[4];
		uint indices;
	};

	// NOTE(review): isFourColorMode, evaluatePaletteFast, evaluatePalette3,
	// decodeBlock, setIndices, flip4 and flip2 are declared here but not
	// defined in the visible part of this file.
	bool isFourColorMode() const;

	/// Expand the endpoints into a 4 entry palette; returns 4 when
	/// col0.u > col1.u and 3 otherwise.
	uint evaluatePalette(Color32 color_array[4]) const;
	uint evaluatePaletteFast(Color32 color_array[4]) const;
	void evaluatePalette3(Color32 color_array[4]) const;
	/// Expand the endpoints into a palette, always using the 4 color rule.
	void evaluatePalette4(Color32 color_array[4]) const;

	void decodeBlock(ColorBlock * block) const;

	void setIndices(int * idx);

	void flip4();
	void flip2();
};



// Build the palette as if the block were in 4 color mode, whatever the
// relative order of col0 and col1.
void BlockDXT1::evaluatePalette4(Color32 color_array[4]) const
{
	Color32 & end0 = color_array[0];
	Color32 & end1 = color_array[1];
	Color32 & near0 = color_array[2];
	Color32 & near1 = color_array[3];

	// Widen 5:6:5 to 8:8:8 by replicating the top bits into the low bits.
	end0.r = (col0.r << 3) | (col0.r >> 2);
	end0.g = (col0.g << 2) | (col0.g >> 4);
	end0.b = (col0.b << 3) | (col0.b >> 2);
	end0.a = 0xFF;

	end1.r = (col1.r << 3) | (col1.r >> 2);
	end1.g = (col1.g << 2) | (col1.g >> 4);
	end1.b = (col1.b << 3) | (col1.b >> 2);
	end1.a = 0xFF;

	// Entry 2 sits one third of the way from endpoint 0 to endpoint 1.
	near0.r = (2 * end0.r + end1.r) / 3;
	near0.g = (2 * end0.g + end1.g) / 3;
	near0.b = (2 * end0.b + end1.b) / 3;
	near0.a = 0xFF;

	// Entry 3 sits two thirds of the way from endpoint 0 to endpoint 1.
	near1.r = (2 * end1.r + end0.r) / 3;
	near1.g = (2 * end1.g + end0.g) / 3;
	near1.b = (2 * end1.b + end0.b) / 3;
	near1.a = 0xFF;
}



/// DXT5 alpha block.
/// Two 8 bit alpha endpoints followed by sixteen 3 bit selectors, all packed
/// into one 64 bit word that can also be accessed whole through `u`.
struct AlphaBlockDXT5
{
	union {
		struct {
			uint64 alpha0 : 8;	// 8
			uint64 alpha1 : 8;	// 16
			uint64 bits0 : 3;	// 3 - 19
			uint64 bits1 : 3; 	// 6 - 22
			uint64 bits2 : 3; 	// 9 - 25
			uint64 bits3 : 3;	// 12 - 28
			uint64 bits4 : 3;	// 15 - 31
			uint64 bits5 : 3;	// 18 - 34
			uint64 bits6 : 3;	// 21 - 37
			uint64 bits7 : 3;	// 24 - 40
			uint64 bits8 : 3;	// 27 - 43
			uint64 bits9 : 3; 	// 30 - 46
			uint64 bitsA : 3; 	// 33 - 49
			uint64 bitsB : 3;	// 36 - 52
			uint64 bitsC : 3;	// 39 - 55
			uint64 bitsD : 3;	// 42 - 58
			uint64 bitsE : 3;	// 45 - 61
			uint64 bitsF : 3;	// 48 - 64
		};
		uint64 u;
	};

	/// Build the 8 entry alpha palette; uses the 8-alpha rule when
	/// alpha0 > alpha1 and the 6-alpha rule otherwise.
	void evaluatePalette(uint8 alpha[8]) const;
	void evaluatePalette8(uint8 alpha[8]) const;
	void evaluatePalette6(uint8 alpha[8]) const;
	/// Unpack the sixteen 3 bit selectors into index_array.
	void indices(uint8 index_array[16]) const;

	/// Read / write selector `index` by shifting `u` (selector i is assumed
	/// to live at bit 16 + 3*i, matching the bit-field order above).
	uint index(uint index) const;
	void setIndex(uint index, uint value);

	/// Write the decoded alpha of each texel into block's alpha channel.
	void decodeBlock(ColorBlock * block) const;

	// NOTE(review): flip4 and flip2 are declared but not defined in the
	// visible part of this file.
	void flip4();
	void flip2();
};


/// DXT5 block.
/// An alpha block followed by a DXT1 color block. CompressBlock4x4 copies
/// 16 bytes out of this struct, so it assumes the struct is exactly 16
/// bytes with no padding.
struct BlockDXT5
{
	AlphaBlockDXT5 alpha;
	BlockDXT1 color;

	// NOTE(review): decodeBlock, flip4 and flip2 are declared but not
	// defined in the visible part of this file.
	void decodeBlock(ColorBlock * block) const;

	void flip4();
	void flip2();
};


/// Expand the two 5:6:5 endpoints into the 4 entry palette of this block.
/// Returns 4 when col0.u > col1.u (two interpolated colors) and 3 otherwise
/// (one midpoint color plus a fully zeroed fourth entry).
uint BlockDXT1::evaluatePalette(Color32 color_array[4]) const
{
	Color32 & end0 = color_array[0];
	Color32 & end1 = color_array[1];
	Color32 & third = color_array[2];
	Color32 & fourth = color_array[3];

	// Bit expansion happens before interpolation: the top bits of each
	// channel are replicated into the freshly opened low bits.
	// (A packed-word variant of this expansion is possible but not used.)
	end0.r = (col0.r << 3) | (col0.r >> 2);
	end0.g = (col0.g << 2) | (col0.g >> 4);
	end0.b = (col0.b << 3) | (col0.b >> 2);
	end0.a = 0xFF;

	end1.r = (col1.r << 3) | (col1.r >> 2);
	end1.g = (col1.g << 2) | (col1.g >> 4);
	end1.b = (col1.b << 3) | (col1.b >> 2);
	end1.a = 0xFF;

	const bool fourColorMode = col0.u > col1.u;
	if (!fourColorMode) {
		// Three-color block: the third entry is the midpoint.
		third.r = (end0.r + end1.r) / 2;
		third.g = (end0.g + end1.g) / 2;
		third.b = (end0.b + end1.b) / 2;
		third.a = 0xFF;

		// Every component of the fourth entry is 0 to match the DXT specs.
		fourth.r = 0x00;
		fourth.g = 0x00;
		fourth.b = 0x00;
		fourth.a = 0x00;

		return 3;
	}

	// Four-color block: entries 2 and 3 sit at 1/3 and 2/3 between the ends.
	third.r = (2 * end0.r + end1.r) / 3;
	third.g = (2 * end0.g + end1.g) / 3;
	third.b = (2 * end0.b + end1.b) / 3;
	third.a = 0xFF;

	fourth.r = (2 * end1.r + end0.r) / 3;
	fourth.g = (2 * end1.g + end0.g) / 3;
	fourth.b = (2 * end1.b + end0.b) / 3;
	fourth.a = 0xFF;

	return 4;
}



/// Decode this alpha block into the alpha channel of `block`; the color
/// channels of `block` are left untouched.
void AlphaBlockDXT5::decodeBlock(ColorBlock * block) const
{
//	nvDebugCheck(block != NULL);

	uint8 palette[8];
	evaluatePalette(palette);

	uint8 selectors[16];
	indices(selectors);

	for (uint texel = 0; texel < 16; ++texel) {
		const uint8 decoded = palette[selectors[texel]];
		block->color(texel).a = decoded;
	}
}

/// Writable access to one texel of the block. The index is not range checked.
inline Color32 & ColorBlock::color(uint i)
{
//	nvDebugCheck(i < 16);
	Color32 & texel = m_color[i];
	return texel;
}


/// Unpack the sixteen 3 bit selectors of this block into index_array,
/// one selector per element, in texel order.
void AlphaBlockDXT5::indices(uint8 index_array[16]) const
{
	const uint8 unpacked[16] = {
		uint8(bits0), uint8(bits1), uint8(bits2), uint8(bits3),
		uint8(bits4), uint8(bits5), uint8(bits6), uint8(bits7),
		uint8(bits8), uint8(bits9), uint8(bitsA), uint8(bitsB),
		uint8(bitsC), uint8(bitsD), uint8(bitsE), uint8(bitsF)
	};

	for (uint texel = 0; texel < 16; ++texel) {
		index_array[texel] = unpacked[texel];
	}
}

/// Build the 8 entry alpha palette, choosing the interpolation rule from
/// the order of the endpoints: 8-alpha when alpha0 > alpha1, else 6-alpha.
void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8]) const
{
	const bool eightAlphaMode = alpha0 > alpha1;
	if (eightAlphaMode) {
		evaluatePalette8(alpha);
		return;
	}
	evaluatePalette6(alpha);
}


/// 8-alpha palette: entries 0 and 1 are the endpoints, entries 2..7 are six
/// evenly spaced blends running from alpha0 towards alpha1 (integer division).
void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8]) const
{
	const uint first = alpha0;
	const uint second = alpha1;

	// Bit code 000 = alpha0, 001 = alpha1.
	alpha[0] = (uint8)first;
	alpha[1] = (uint8)second;

	// Bit codes 010..111: weight moves from 6:1 to 1:6 in sevenths.
	for (uint step = 1; step <= 6; ++step) {
		alpha[step + 1] = (uint8)(((7 - step) * first + step * second) / 7);
	}
}


/// 6-alpha palette: entries 0 and 1 are the endpoints, entries 2..5 are four
/// evenly spaced blends, and the last two entries are the constants 0 and 255.
void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8]) const
{
	const uint first = alpha0;
	const uint second = alpha1;

	// Bit code 000 = alpha0, 001 = alpha1.
	alpha[0] = (uint8)first;
	alpha[1] = (uint8)second;

	// Bit codes 010..101: weight moves from 4:1 to 1:4 in fifths.
	for (uint step = 1; step <= 4; ++step) {
		alpha[step + 1] = (uint8)(((5 - step) * first + step * second) / 5);
	}

	alpha[6] = 0x00;	// Bit code 110
	alpha[7] = 0xFF;	// Bit code 111
}

/// Overwrite the 3 bit selector of texel `index` with `value`.
/// Neither argument is range checked; `value` is not masked to 3 bits.
void AlphaBlockDXT5::setIndex(uint index, uint value)
{
	//	nvDebugCheck(index < 16);
	//	nvDebugCheck(value < 8);

	// Selectors start after the two 8 bit endpoints, 3 bits apiece.
	const int shift = 16 + 3 * index;
	const uint64 cleared = this->u & ~(uint64(0x7) << shift);
	this->u = cleared | (uint64(value) << shift);
}

/// Sum, over the 16 texels, of the squared distance between the texel's
/// alpha and the closest entry of the block's palette. The block's selectors
/// are neither read nor written.
static uint computeAlphaError(const ColorBlock & rgba, const AlphaBlockDXT5 * block)
{
	uint8 palette[8];
	block->evaluatePalette(palette);

	uint sum = 0;

	for (uint texel = 0; texel < 16; ++texel)
	{
		const uint8 target = rgba.color(texel).a;

		// Larger than any possible squared 8 bit difference.
		uint smallest = 256*256;
		for (uint entry = 0; entry < 8; ++entry)
		{
			const int delta = palette[entry] - target;
			const uint squared = delta * delta;
			if (squared < smallest)
			{
				smallest = squared;
			}
		}

		sum += smallest;
	}

	return sum;
}


// Identity helper: returns the color it is given. The palette/error
// routines route every color through it before measuring distances.
inline static Color32 loadColor(Color32 c)
{
	return c;
}

// Square of s in unsigned 32 bit arithmetic. colorDistance passes possibly
// negative channel differences; they wrap on conversion to uint, and the
// wrapped product still equals the square of the difference.
inline static uint sqr(uint s)
{
	return s*s;
}

// Squared Euclidean distance between two colors over R, G and B.
// Alpha does not take part.
inline static uint colorDistance(Color32 c0, Color32 c1)
{
	const uint redTerm = sqr(c0.r - c1.r);
	const uint greenTerm = sqr(c0.g - c1.g);
	const uint blueTerm = sqr(c0.b - c1.b);
	return redTerm + greenTerm + blueTerm;
	// Alternative metric (sum of absolute differences), not used:
	//return abs(c0.r - c1.r) + abs(c0.g - c1.g) + abs(c0.b - c1.b);
}

/*
/// Return the maximum of two values.
template <typename T> 
inline const T & nvmax(const T & a, const T & b)
{
	//return std::max(a, b);
	if( a < b ) {
		return b; 
	}
	return a;
}



/// Return the minimum of two values.
template <typename T> 
inline const T & nvmin(const T & a, const T & b)
{
	//return std::min(a, b);
	if( b < a ) {
		return b; 
	}
	return a;
}
*/






/// Exchange the contents of a and b through one temporary copy.
template <typename T> 
inline void swap(T & a, T & b)
{
	//return std::swap(a, b);
	const T held(b);
	b = a;
	a = held;
}

// No-op in this build. It is called at the end of the routines that work on
// VectorColor values (paletteError, computeIndices, blockError).
inline void vectorEnd()
{
}

/// Total squared RGB error of the block when every texel is mapped to
/// whichever of the four palette entries is closest to it.
inline static uint paletteError(const ColorBlock & rgba, Color32 palette[4])
{
	uint total = 0;

	for (uint texel = 0; texel < 16; ++texel) {
		const VectorColor px = rgba.color(texel);

		// Keep the smallest of the four distances.
		uint nearest = colorDistance(px, palette[0]);
		for (uint entry = 1; entry < 4; ++entry) {
			const uint dist = colorDistance(px, palette[entry]);
			if (dist < nearest) {
				nearest = dist;
			}
		}

		total += nearest;
	}

	vectorEnd();
	return total;
}


/// Read the 3 bit selector of texel `index` from the packed word.
/// The argument is not range checked.
uint AlphaBlockDXT5::index(uint index) const
{
	//	nvDebugCheck(index < 16);

	// Selectors start after the two 8 bit endpoints, 3 bits apiece.
	const int shift = 16 + 3 * index;
	const uint64 selector = (this->u >> shift) & 0x7;
	return (uint)selector;
}


static bool sameIndices(const AlphaBlockDXT5 & block0, const AlphaBlockDXT5 & block1)
{
	const uint64 mask = ~uint64(0xFFFF);
	return (block0.u | mask) == (block1.u | mask);
}



static uint computeAlphaIndices(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	uint8 alphas[8];
	block->evaluatePalette(alphas);

	uint totalError = 0;

	for (uint i = 0; i < 16; i++)
	{
		uint8 alpha = rgba.color(i).a;

		uint besterror = 256*256;
		uint best;
		for(uint p = 0; p < 8; p++)
		{
			int d = alphas[p] - alpha;
			uint error = d * d;

			if (error < besterror)
			{
				besterror = error;
				best = p;
			}
		}

		totalError += besterror;
		block->setIndex(i, best);
	}

	return totalError;
}


static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	float alpha2_sum = 0;
	float beta2_sum = 0;
	float alphabeta_sum = 0;
	float alphax_sum = 0;
	float betax_sum = 0;

	for (int i = 0; i < 16; i++)
	{
		uint idx = block->index(i);
		float alpha;
		if (idx < 2) alpha = 1.0f - idx;
		else alpha = (8.0f - idx) / 7.0f;

		float beta = 1 - alpha;

		alpha2_sum += alpha * alpha;
		beta2_sum += beta * beta;
		alphabeta_sum += alpha * beta;
		alphax_sum += alpha * rgba.color(i).a;
		betax_sum += beta * rgba.color(i).a;
	}

	const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

	float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
	float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

	uint alpha0 = uint(min(max(a, 0.0f), 255.0f));
	uint alpha1 = uint(min(max(b, 0.0f), 255.0f));

	if (alpha0 < alpha1)
	{
		swap(alpha0, alpha1);

		// Flip indices:
		for (int i = 0; i < 16; i++)
		{
			uint idx = block->index(i);
			if (idx < 2) block->setIndex(i, 1 - idx);
			else block->setIndex(i, 9 - idx);
		}
	}
	else if (alpha0 == alpha1)
	{
		for (int i = 0; i < 16; i++)
		{
			block->setIndex(i, 0);
		}
	}

	block->alpha0 = alpha0;
	block->alpha1 = alpha1;
}


// alpha only
uint compressBlock_Iterative(const ColorBlock & rgba, AlphaBlockDXT5 * resultblock)
{
	uint8 alpha0 = 0;
	uint8 alpha1 = 255;

	// Get min/max alpha.
	for (uint i = 0; i < 16; i++)
	{
		uint8 alpha = rgba.color(i).a;
		alpha0 = max(alpha0, alpha);
		alpha1 = min(alpha1, alpha);
	}

	AlphaBlockDXT5 block;
	block.alpha0 = alpha0 - (alpha0 - alpha1) / 34;
	block.alpha1 = alpha1 + (alpha0 - alpha1) / 34;
	uint besterror = computeAlphaIndices(rgba, &block);

	AlphaBlockDXT5 bestblock = block;

	while(true)
	{
		optimizeAlpha8(rgba, &block);
		uint error = computeAlphaIndices(rgba, &block);

		if (error >= besterror)
		{
			// No improvement, stop.
			break;
		}
		if (sameIndices(block, bestblock))
		{
			bestblock = block;
			break;
		}

		besterror = error;
		bestblock = block;
	};

	// Copy best block to result;
	*resultblock = bestblock;

	return besterror;
}


// alpha only
/// Bounds-range alpha compressor: take the min/max alpha of the block, pull
/// both ends inwards by 1/32 of the range, then try the endpoints in both
/// orders (alpha0 > alpha1 and the reverse) and keep the order with the lower
/// error. The chosen block is written to *block and its error is returned.
uint compressBlock_BoundsRange(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	uint8 alpha0 = 0;	// becomes the maximum alpha
	uint8 alpha1 = 255;	// becomes the minimum alpha

	// Get min/max alpha.
	for (uint i = 0; i < 16; i++)
	{
		const uint8 alpha = rgba.color(i).a;
		if (alpha > alpha0) alpha0 = alpha;
		if (alpha < alpha1) alpha1 = alpha;
	}

	// Both ends move by the same amount, derived once from the untouched
	// range (previously the second inset was computed from the already
	// shrunk maximum, making the two ends asymmetric).
	const uint8 inset = uint8((alpha0 - alpha1) / 32);
	alpha0 = uint8(alpha0 - inset);
	alpha1 = uint8(alpha1 + inset);

	// Candidate 0: alpha0 >= alpha1.
	AlphaBlockDXT5 block0;
	block0.u = 0;	// setIndex reads the word before every bit has been written
	block0.alpha0 = alpha0;
	block0.alpha1 = alpha1;
	const uint error0 = computeAlphaIndices(rgba, &block0);

	// Candidate 1: endpoints swapped.
	AlphaBlockDXT5 block1;
	block1.u = 0;
	block1.alpha0 = alpha1;
	block1.alpha1 = alpha0;
	const uint error1 = computeAlphaIndices(rgba, &block1);

	if (error0 < error1)
	{
		*block = block0;
		return error0;
	}

	// Ties go to the swapped candidate.
	*block = block1;
	return error1;
}



/// Pick a 2 bit palette selector for each of the 16 texels and return them
/// packed into one word (texel i occupies bits 2*i and 2*i+1).
/// The selection is branch-free and built from five pairwise distance
/// comparisons rather than a full four-way minimum.
/// NOTE(review): this appears to rely on the palette being laid out as in
/// evaluatePalette4 (entries 2 and 3 between entries 0 and 1); confirm before
/// using it with a three-color palette.
inline static uint computeIndices(const ColorBlock & rgba, const Color32 palette[4])
{
	const VectorColor vcolor0 = loadColor(palette[0]);
	const VectorColor vcolor1 = loadColor(palette[1]);
	const VectorColor vcolor2 = loadColor(palette[2]);
	const VectorColor vcolor3 = loadColor(palette[3]);

	uint indices = 0;
	for(int i = 0; i < 16; i++) {
		const VectorColor vcolor = loadColor(rgba.color(i));

		// Squared RGB distance from the texel to each palette entry.
		uint d0 = colorDistance(vcolor0, vcolor);
		uint d1 = colorDistance(vcolor1, vcolor);
		uint d2 = colorDistance(vcolor2, vcolor);
		uint d3 = colorDistance(vcolor3, vcolor);

		/*if (d0 < d1 && d0 < d2 && d0 < d3) {
		indices |= 0 << (2 * i);
		}
		else if (d1 < d2 && d1 < d3) {
		indices |= 1 << (2 * i);
		}
		else if (d2 < d3) {
		indices |= 2 << (2 * i);
		}
		else {
		indices |= 3 << (2 * i);
		}*/

		/*
		uint b0 = d0 > d2;
		uint b1 = d1 > d3;
		uint b2 = d0 > d3;
		uint b3 = d1 > d2;
		uint b4 = d0 > d1;
		uint b5 = d2 > d3;

		uint x0 = b1 & b2;
		uint x1 = b0 & b3;
		uint x2 = b2 & b5;
		uint x3 = !b3 & b4;

		indices |= ((x3 | x2) | ((x1 | x0) << 1)) << (2 * i);
		*/

		// Pairwise comparisons, each 0 or 1.
		uint b0 = d0 > d3;
		uint b1 = d1 > d2;
		uint b2 = d0 > d2;
		uint b3 = d1 > d3;
		uint b4 = d2 > d3;

		// x0: entry 2 beats both endpoints; x1: entry 3 beats both
		// endpoints; x2: entry 3 beats entries 0 and 2.
		uint x0 = b1 & b2;
		uint x1 = b0 & b3;
		uint x2 = b0 & b4;

		// Low selector bit is x2, high selector bit is x0 | x1.
		indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
	}

	vectorEnd();
	return indices;
}

// alpha only
/// Exhaustive alpha compressor. Starts from the exact min/max alpha of the
/// block; when that range is wider than 8 it additionally tries every
/// endpoint pair (a0, a1) inside the range with a0 - a1 > 8 and keeps the
/// pair with the smallest error. Selectors are then computed for the chosen
/// endpoints and the resulting error is returned.
uint compressBlock_BruteForce(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	uint8 lowest = 255;
	uint8 highest = 0;

	// Get min/max alpha.
	for (uint texel = 0; texel < 16; ++texel)
	{
		const uint8 value = rgba.color(texel).a;
		if (value < lowest) lowest = value;
		if (value > highest) highest = value;
	}

	block->alpha0 = highest;
	block->alpha1 = lowest;

	// Narrow ranges are not searched: the min/max endpoints are used as-is.
	if (highest - lowest <= 8)
	{
		return computeAlphaIndices(rgba, block);
	}

	int leastError = computeAlphaError(rgba, block);
	int chosen0 = highest;
	int chosen1 = lowest;

	for (int a0 = lowest + 9; a0 < highest; ++a0)
	{
		for (int a1 = lowest; a1 < a0 - 8; ++a1)
		{
			// Pruning: skip pairs whose distance from the min/max
			// endpoints already exceeds the best error so far.
			const int shrink = (highest - a0) + (a1 - lowest);
			if (shrink <= leastError)
			{
				block->alpha0 = a0;
				block->alpha1 = a1;
				const int candidateError = computeAlphaError(rgba, block);

				if (candidateError < leastError)
				{
					leastError = candidateError;
					chosen0 = a0;
					chosen1 = a1;
				}
			}
		}
	}

	block->alpha0 = chosen0;
	block->alpha1 = chosen1;

	return computeAlphaIndices(rgba, block);
}

// -----------------------------------------------------------



// Compressor that uses bounding box.
void compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 c0, c1;
	rgba.boundsRange(&c1, &c0);

	block->col0 = toColor16(c0);
	block->col1 = toColor16(c1);

//	nvDebugCheck(block->col0.u >= block->col1.u);

	// Use 4 color mode only.
	//if (block->col0.u < block->col1.u) {
	//	swap(block->col0.u, block->col1.u);
	//}

	Color32 palette[4];
	block->evaluatePalette4(palette);

	block->indices = computeIndices(rgba, palette);
}



// Compressor that tests all input color pairs.
void compressBlock_TestAllPairs(const ColorBlock & rgba, BlockDXT1 * block)
{
	uint best_error = uint(-1);
	Color16 best_col0, best_col1;

	Color32 palette[4];

	// Test all color pairs.
	for(uint i = 0; i < 16; i++) {
		block->col0 = toColor16(rgba.color(i));

		for(uint ii = 0; ii < 16; ii++) {
			if( i != ii ) {
				block->col1 = toColor16(rgba.color(ii));
				block->evaluatePalette(palette);

				const uint error = paletteError(rgba, palette);
				if(error < best_error) {
					best_error = error;
					best_col0 = block->col0;
					best_col1 = block->col1;
				}
			}
		}
	}

	block->col0 = best_col0;
	block->col1 = best_col1;
	block->evaluatePalette(palette);

	block->indices = computeIndices(rgba, palette);
}



// Tag types. `zero` selects the Vector3 constructor that zero-initialises
// all components; `identity` is not used in the visible part of this file.
enum zero_t { zero };
enum identity_t { identity };

// I should probably use templates.
// Component type of Vector3.
typedef float scalar;


/// Three component float vector with the small set of operations needed by
/// the endpoint optimisation below.
class Vector3
{
public:
	// Parameter type used for passing vectors (const reference).
	typedef Vector3 const & Arg;

	/// Leaves the components uninitialised.
	Vector3();
	/// All components set to zero.
	explicit Vector3(zero_t);
	Vector3(scalar x, scalar y, scalar z);
//	Vector3(Vector2::Arg v, scalar z);
	Vector3(Vector3::Arg v);

	const Vector3 & operator=(Vector3::Arg v);

	// Component accessors.
	scalar x() const;
	scalar y() const;
	scalar z() const;

//	const Vector2 & xy() const;

	// @@ temporary... should use an explicit method?
	/// Address of the first component.
	const scalar * ptr() const;

	void set(scalar x, scalar y, scalar z);

	/// Component-wise negation.
	Vector3 operator-() const;
	void operator+=(Vector3::Arg v);
	void operator-=(Vector3::Arg v);
	/// Multiply every component by s.
	void operator*=(scalar s);
	/// Divide every component by s (no check for s == 0).
	inline void operator/=(scalar s)
	{ m_x /= s; m_y /= s; m_z /= s; }
	/// Component-wise multiplication by v.
	void operator*=(Vector3::Arg v);

	// Exact component-wise comparison.
	friend bool operator==(Vector3::Arg a, Vector3::Arg b);
	friend bool operator!=(Vector3::Arg a, Vector3::Arg b);

private:
	scalar m_x, m_y, m_z;
};


// Default construction does not initialise the components.
inline Vector3::Vector3()
{
}

// Tag constructor: every component starts at zero.
inline Vector3::Vector3(zero_t)
	: m_x(0.0f)
	, m_y(0.0f)
	, m_z(0.0f)
{
}

// Component-wise construction.
inline Vector3::Vector3(scalar x, scalar y, scalar z)
	: m_x(x)
	, m_y(y)
	, m_z(z)
{
}

// inline Vector3::Vector3(Vector2::Arg v, scalar z) : m_x(v.x()), m_y(v.y()), m_z(z) {}

// Copy construction.
inline Vector3::Vector3(Vector3::Arg v)
	: m_x(v.m_x)
	, m_y(v.m_y)
	, m_z(v.m_z)
{
}


// Component-wise minimum of two vectors.
inline Vector3 nvmin(Vector3::Arg a, Vector3::Arg b)
{
	const scalar lowX = min(a.x(), b.x());
	const scalar lowY = min(a.y(), b.y());
	const scalar lowZ = min(a.z(), b.z());
	return Vector3(lowX, lowY, lowZ);
}

// Component-wise maximum of two vectors.
inline Vector3 nvmax(Vector3::Arg a, Vector3::Arg b)
{
	const scalar highX = max(a.x(), b.x());
	const scalar highY = max(a.y(), b.y());
	const scalar highZ = max(a.z(), b.z());
	return Vector3(highX, highY, highZ);
}

// Copy assignment; returns a const reference to this vector.
inline const Vector3 & Vector3::operator=(Vector3::Arg v)
{
	set(v.m_x, v.m_y, v.m_z);
	return *this;
}

// Read-only access to the individual components.
inline scalar Vector3::x() const { return m_x; } 
inline scalar Vector3::y() const { return m_y; }
inline scalar Vector3::z() const { return m_z; }
/*
inline const Vector2 & Vector3::xy() const
{
	return *(Vector2 *)this;
}
*/

// Address of the x component.
// NOTE(review): callers presumably treat this as an array of three scalars,
// which assumes m_x, m_y and m_z are laid out contiguously — confirm.
inline const scalar * Vector3::ptr() const
{
	return &m_x;
}


// Overwrite all three components at once.
inline void Vector3::set(scalar x, scalar y, scalar z)
{
	m_z = z;
	m_y = y;
	m_x = x;
}


// Component-wise difference a - b.
inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
{
	const scalar dx = a.x() - b.x();
	const scalar dy = a.y() - b.y();
	const scalar dz = a.z() - b.z();
	return Vector3(dx, dy, dz);
}
// Operator form of sub().
inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
{
	const Vector3 difference = sub(a, b);
	return difference;
}

// Negate every component.
inline Vector3 Vector3::operator-() const
{
	const scalar negX = -m_x;
	const scalar negY = -m_y;
	const scalar negZ = -m_z;
	return Vector3(negX, negY, negZ);
}

// Add v to this vector, component by component.
inline void Vector3::operator+=(Vector3::Arg v)
{
	set(m_x + v.m_x, m_y + v.m_y, m_z + v.m_z);
}

// Subtract v from this vector, component by component.
inline void Vector3::operator-=(Vector3::Arg v)
{
	set(m_x - v.m_x, m_y - v.m_y, m_z - v.m_z);
}

// Scale every component by s.
inline void Vector3::operator*=(scalar s)
{
	set(m_x * s, m_y * s, m_z * s);
}

// Multiply this vector by v, component by component.
inline void Vector3::operator*=(Vector3::Arg v)
{
	set(m_x * v.m_x, m_y * v.m_y, m_z * v.m_z);
}

// Exact equality: true only when all three components compare equal.
inline bool operator==(Vector3::Arg a, Vector3::Arg b)
{
	if (a.m_x != b.m_x) return false;
	if (a.m_y != b.m_y) return false;
	return a.m_z == b.m_z;
}
// Exact inequality: true as soon as any component differs.
inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
{
	if (a.m_x != b.m_x) return true;
	if (a.m_y != b.m_y) return true;
	return a.m_z != b.m_z;
}

// Return v with every component multiplied by s.
inline Vector3 scale(Vector3::Arg v, scalar s)
{
	const scalar sx = v.x() * s;
	const scalar sy = v.y() * s;
	const scalar sz = v.z() * s;
	return Vector3(sx, sy, sz);
}
// scalar * vector.
inline Vector3 operator*(scalar s, Vector3::Arg v)
{
	const Vector3 scaled = scale(v, s);
	return scaled;
}

// vector * scalar.
inline Vector3 operator*(Vector3::Arg v, scalar s)
{
	const Vector3 scaled = scale(v, s);
	return scaled;
}


// Convert the RGB channels of a color to a vector with components in [0, 1].
// Alpha is dropped.
inline Vector3 toVector3(Color32 c)
{
	// Multiply by the reciprocal rather than dividing by 255.
	const float unit = 1.0f / 255.0f;
	const float red = c.r * unit;
	const float green = c.g * unit;
	const float blue = c.b * unit;
	return Vector3(red, green, blue);
}




/// Total squared RGB error of `block` against the source texels, using the
/// selectors stored in the block (not the closest palette entry).
uint blockError(const ColorBlock & rgba, const BlockDXT1 & block)
{
	Color32 entries[4];
	block.evaluatePalette(entries);

	uint total = 0;
	for (uint texel = 0; texel < 16; ++texel) {
		const VectorColor source = rgba.color(texel);

		// Two selector bits per texel, texel 0 in the lowest bits.
		const int selector = (block.indices >> (2 * texel)) & 3;
		const VectorColor decoded = entries[selector];

		total += colorDistance(source, decoded);
	}

	//nvDebugCheck(error == paletteError(rgba, palette));

	vectorEnd();
	return total;
}


/// Total squared alpha error of `block` against the source texels, using the
/// selectors stored in the block.
uint blockError(const ColorBlock & rgba, const AlphaBlockDXT5 & block)
{
	uint8 entries[8];
	block.evaluatePalette(entries);

	uint8 selectors[16];
	block.indices(selectors);

	uint total = 0;
	for (uint texel = 0; texel < 16; ++texel) {
		const int decoded = entries[selectors[texel]];
		const int delta = decoded - rgba.color(texel).a;
		total += uint(delta * delta);
	}

	return total;
}


/// Least-squares refinement of the two DXT1 endpoints for the selectors
/// currently stored in `block`, interpreted with the 4 color weights
/// (selector 0 -> col0, 1 -> col1, 2 -> 1/3 towards col1, 3 -> 2/3).
/// The refined candidate replaces *block only when its error is strictly
/// lower; otherwise *block is left exactly as it was passed in.
void optimizeEndPoints(const ColorBlock & rgba, BlockDXT1 * block)
{
	float alpha2_sum = 0.0f;
	float beta2_sum = 0.0f;
	float alphabeta_sum = 0.0f;
	Vector3 alphax_sum(zero);
	Vector3 betax_sum(zero);

	for( int i = 0; i < 16; ++i )
	{
		const uint bits = block->indices >> (2 * i);

		// beta is the weight of col1, alpha the weight of col0.
		float beta = (bits & 1);
		if (bits & 2) beta = (1 + beta) / 3.0f;
		float alpha = 1.0f - beta;

		const Vector3 x = toVector3(rgba.color(i));

		alpha2_sum += alpha * alpha;
		beta2_sum += beta * beta;
		alphabeta_sum += alpha * beta;
		alphax_sum += alpha * x;
		betax_sum += beta * x;
	}

	// When all 16 texels share one selector the 2x2 system is singular:
	// inverting it would yield infinite/NaN endpoints. The weights are
	// multiples of 1/3, so a solvable system has a determinant of at least
	// 15/9; anything near zero is the singular case and there is nothing
	// to optimise.
	const float determinant = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
	const float singularLimit = 1e-3f;
	if (determinant < singularLimit && determinant > -singularLimit)
	{
		return;
	}

	const float factor = 1.0f / determinant;

	Vector3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
	Vector3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

	// Clamp both endpoints to the unit cube.
	Vector3 zero1(0, 0, 0);
	Vector3 one1(1, 1, 1);
	a = nvmin(one1, nvmax(zero1, a));
	b = nvmin(one1, nvmax(zero1, b));

	BlockDXT1 B;

	// Convert a,b to 565 (the conversion truncates).
	B.col0.r = uint16(a.x() * 31);
	B.col0.g = uint16(a.y() * 63);
	B.col0.b = uint16(a.z() * 31);
	B.col1.r = uint16(b.x() * 31);
	B.col1.g = uint16(b.y() * 63);
	B.col1.b = uint16(b.z() * 31);
	B.indices = block->indices;

	// Force 4 color mode.
	if (B.col0.u < B.col1.u)
	{
		// Swapping the endpoints swaps selectors 0<->1 and 2<->3, which is
		// a flip of the low bit of every 2 bit selector.
		swap(B.col0.u, B.col1.u);
		B.indices ^= 0x55555555;
	}
	else if (B.col0.u == B.col1.u)
	{
		// Both endpoints collapsed to one color: point every texel of the
		// CANDIDATE at col0. The caller's block must stay untouched until
		// the candidate has proven to be better.
		B.indices = 0;
	}

	if (blockError(rgba, B) < blockError(rgba, *block))
	{
		*block = B;
	}
}
// ----------------------------------
// ----------------------------------
// ----------------------------------
// ----------------------------------
// ----------------------------------
// ----------------------------------	
// ----------------------------------
// ----------------------------------


// Nothing to set up; the constructor body is empty.
CSimpleDXTCompressor::CSimpleDXTCompressor()
{
}

// Nothing to release; the destructor body is empty.
CSimpleDXTCompressor::~CSimpleDXTCompressor()
{
}


void CSimpleDXTCompressor::CompressBlock4x4( const uint32 dwCol[16], uint8 outBlock[16] )
{
	ColorBlock in(dwCol);
	BlockDXT5 out_dxt5;

	// color only
//	compressBlock_TestAllPairs(in,&out_dxt5.color);			// handles 2 colors in the block quite badly, bug?
	compressBlock_BoundsRange(in,&out_dxt5.color);

	optimizeEndPoints(in, &out_dxt5.color);			// does it improve ?

	// alpha only
//	compressBlock_BruteForce(in,&out_alpha);		
//	compressBlock_Iterative(in,&out_alpha);
	compressBlock_BoundsRange(in,&out_dxt5.alpha);

//	memcpy(&outBlock[0],&out_alpha,16);
	memcpy(&outBlock[0],&out_dxt5,16);
}


