/*
 *	This is a simple test of PS3 Altivec matrix performance
 *  
 *  Check main.s for the generated assembly
 *
 *  Author: Jay Ryness
 *
 *  Copyright 2006 Sony Online Entertainment.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <Altivec.h>

// Altivec.h doesn't force these inline, so we must.
// The intrinsics used by the VMX multiply routines below (vec_madd, vec_ld,
// vec_st) are redeclared here with the always_inline attribute so that calls
// to them are expanded in place rather than emitted as function calls.
// NOTE(review): OFFSET_T is not defined in this file; presumably it comes
// from Altivec.h -- confirm.
inline vector float vec_madd (vector float a1, vector float a2, vector float a3) __attribute__ ((always_inline));
inline vector float vec_ld (OFFSET_T a1, const vector float *a2) __attribute__ ((always_inline));
inline vector float vec_ld (OFFSET_T a1, const float *a2) __attribute__ ((always_inline));
inline void vec_st (vector float a1, OFFSET_T a2, vector float *a3) __attribute__ ((always_inline));
inline void vec_st (vector float a1, OFFSET_T a2, float *a3) __attribute__ ((always_inline));

#include "clock.h"
#include "performance_clock.h"

// Number of multiplies performed inside each timed loop in main(); the
// untimed warm-up loop runs 20x this many.
#define ITERATIONS (1000*1000)
// Number of matrices in each operand/result array. Loop counters are wrapped
// into the arrays with "i & (ARRAY_SIZE-1)", which is only a valid modulo
// when the size is a power of two.
#define ARRAY_SIZE 16               // must be power of 2

// Setup timers: one per benchmark variant. Each identifier is later passed to
// CLOCK_START / CLOCK_END around the matching loop in main() and to
// CLOCK_REPORT_AND_RESET in the final report.
// NOTE(review): CLOCK is not defined in this file; presumably it comes from
// clock.h / performance_clock.h and the string is the label shown in the
// report -- confirm.
CLOCK( CLOCK_MUL,           "this = A*B"            );
CLOCK( CLOCK_MULSTACK,      "return A*B"            );
CLOCK( CLOCK_MUL3,          "this = A*B*C"          );
CLOCK( CLOCK_MUL3STACK,     "return A*B*C"          );
CLOCK( CLOCK_MULVMX,        "this = A*B   (VMX)"    );
CLOCK( CLOCK_MULVMXSTACK,   "return A*B   (VMX)"    );
CLOCK( CLOCK_MUL3VMX,       "this = A*B*C (VMX)"    );
CLOCK( CLOCK_MUL3VMXSTACK,  "return A*B*C (VMX)"    );

// Four packed single-precision floats, declared with 16-byte alignment.
//
// Provides the scalar (non-VMX) building blocks of the matrix multiply in two
// flavours: member functions that write their result into "this", and friend
// functions that return a freshly constructed Vec4 by value.
struct Vec4
{
    // Leaves all four lanes uninitialized.
    Vec4()
    {
    }

    // Broadcasts A into every lane.
    inline Vec4(float A)
    {
        for (int Lane = 0; Lane < 4; ++Lane)
        {
            X[Lane] = A;
        }
    }

    // Builds the vector (A, B, C, D).
    inline Vec4(float A, float B, float C, float D) __attribute__ ((always_inline))
    {
        const float Lanes[4] = { A, B, C, D };
        for (int Lane = 0; Lane < 4; ++Lane)
        {
            X[Lane] = Lanes[Lane];
        }
    }

    // ---- Versions that write into "this" ----

    // this = Value1 * Scale (element-wise).
    inline void Mul(const Vec4& Value1, float Scale) __attribute__ ((always_inline))
    {
        for (int Lane = 0; Lane < 4; ++Lane)
        {
            X[Lane] = Value1.X[Lane] * Scale;
        }
    }

    // this += Value1 * Scale (element-wise).
    inline void MulAdd(const Vec4& Value1, float Scale) __attribute__ ((always_inline))
    {
        for (int Lane = 0; Lane < 4; ++Lane)
        {
            X[Lane] += (Value1.X[Lane] * Scale);
        }
    }

    // ---- Versions that return the result by value ----

    // Returns Value1 * Scale (element-wise).
    friend inline Vec4 Mul(const Vec4& Value1, float Scale) __attribute__ ((always_inline))
    {
        const float* const In = Value1.X;
        return Vec4( In[0] * Scale,
                     In[1] * Scale,
                     In[2] * Scale,
                     In[3] * Scale );
    }

    // Returns Value1 + Value2 * Scale (element-wise).
    friend inline Vec4 MulAdd(const Vec4& Value1, const Vec4& Value2, float Scale) __attribute__ ((always_inline))
    {
        const float* const Acc = Value1.X;
        const float* const In  = Value2.X;
        return Vec4( Acc[0] + In[0] * Scale,
                     Acc[1] + In[1] * Scale,
                     Acc[2] + In[2] * Scale,
                     Acc[3] + In[3] * Scale );
    }

    // The four lanes, in order.
    float       X[4];

} __attribute__ ((aligned(16)));

struct Mat44
{
    Mat44()
    {
    }
    inline Mat44(float A)
    {
        Vec4 X = Vec4(A);
        R[0] = X;
        R[1] = X;
        R[2] = X;
        R[3] = X;
    }

    inline Mat44(const Vec4& R1, const Vec4& R2, const Vec4& R3, const Vec4& R4) __attribute__ ((always_inline))
    {
        R[0] = R1;
        R[1] = R2;
        R[2] = R3;
        R[3] = R4;
    }

    // Multiply the input matrices and store results in "this"

    inline void Mul(const Mat44& A, const Mat44& B) __attribute__ ((always_inline))
    {
        R[0].Mul(    B.R[0], A.R[0].X[0] );
        R[0].MulAdd( B.R[1], A.R[0].X[1] );
        R[0].MulAdd( B.R[2], A.R[0].X[2] );
        R[0].MulAdd( B.R[3], A.R[0].X[3] );

        R[1].Mul(    B.R[0], A.R[1].X[0] );
        R[1].MulAdd( B.R[1], A.R[1].X[1] );
        R[1].MulAdd( B.R[2], A.R[1].X[2] );
        R[1].MulAdd( B.R[3], A.R[1].X[3] );

        R[2].Mul(    B.R[0], A.R[2].X[0] );
        R[2].MulAdd( B.R[1], A.R[2].X[1] );
        R[2].MulAdd( B.R[2], A.R[2].X[2] );
        R[2].MulAdd( B.R[3], A.R[2].X[3] );

        R[3].Mul(    B.R[0], A.R[3].X[0] );
        R[3].MulAdd( B.R[1], A.R[3].X[1] );
        R[3].MulAdd( B.R[2], A.R[3].X[2] );
        R[3].MulAdd( B.R[3], A.R[3].X[3] );
    };

    // Multiply the input matrices and return results on the stack

    friend inline Mat44 Mul(const Mat44& A, const Mat44& B) __attribute__ ((always_inline))
    {
        return  Mat44(  MulAdd( MulAdd( MulAdd( ::Mul( B.R[0], A.R[0].X[0] ), B.R[1], A.R[0].X[1] ), B.R[2], A.R[0].X[2] ), B.R[3], A.R[0].X[3] ),
                        MulAdd( MulAdd( MulAdd( ::Mul( B.R[0], A.R[1].X[0] ), B.R[1], A.R[1].X[1] ), B.R[2], A.R[1].X[2] ), B.R[3], A.R[1].X[3] ),
                        MulAdd( MulAdd( MulAdd( ::Mul( B.R[0], A.R[2].X[0] ), B.R[1], A.R[2].X[1] ), B.R[2], A.R[2].X[2] ), B.R[3], A.R[2].X[3] ),
                        MulAdd( MulAdd( MulAdd( ::Mul( B.R[0], A.R[3].X[0] ), B.R[1], A.R[3].X[1] ), B.R[2], A.R[3].X[2] ), B.R[3], A.R[3].X[3] ) );
    };

    // Multiply the input matrices using VMX and output to "this"

    inline void MulVMX(const Mat44& A, const Mat44& B) __attribute__ ((always_inline))
    {
        //Load the matrix rows
        const vec_float4 A1 = vec_ld( 0 * sizeof(vec_float4), (vec_float4*)&A );
        const vec_float4 A2 = vec_ld( 1 * sizeof(vec_float4), (vec_float4*)&A );
        const vec_float4 A3 = vec_ld( 2 * sizeof(vec_float4), (vec_float4*)&A );
        const vec_float4 A4 = vec_ld( 3 * sizeof(vec_float4), (vec_float4*)&A );

        const vec_float4 B1 = vec_ld( 0 * sizeof(vec_float4), (vec_float4*)&B );
        const vec_float4 B2 = vec_ld( 1 * sizeof(vec_float4), (vec_float4*)&B );
        const vec_float4 B3 = vec_ld( 2 * sizeof(vec_float4), (vec_float4*)&B );
        const vec_float4 B4 = vec_ld( 3 * sizeof(vec_float4), (vec_float4*)&B );

        const vec_float4 Zero = (vec_float4)vec_splat_u32(0);

        vec_float4 C1, C2, C3, C4;

        //Do the first scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 0 ), B1, Zero );
        C2 = vec_madd( vec_splat( A2, 0 ), B1, Zero );
        C3 = vec_madd( vec_splat( A3, 0 ), B1, Zero );
        C4 = vec_madd( vec_splat( A4, 0 ), B1, Zero );

        //Accumulate in the second scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 1 ), B2, C1 );
        C2 = vec_madd( vec_splat( A2, 1 ), B2, C2 );
        C3 = vec_madd( vec_splat( A3, 1 ), B2, C3 );
        C4 = vec_madd( vec_splat( A4, 1 ), B2, C4 );

        //Accumulate in the third scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 2 ), B3, C1 );
        C2 = vec_madd( vec_splat( A2, 2 ), B3, C2 );
        C3 = vec_madd( vec_splat( A3, 2 ), B3, C3 );
        C4 = vec_madd( vec_splat( A4, 2 ), B3, C4 );

        //Accumulate in the fourth scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 3 ), B4, C1 );
        C2 = vec_madd( vec_splat( A2, 3 ), B4, C2 );
        C3 = vec_madd( vec_splat( A3, 3 ), B4, C3 );
        C4 = vec_madd( vec_splat( A4, 3 ), B4, C4 );

        //Store out the result
        vec_st( C1, 0 * sizeof(vec_float4), (vec_float4*)this );
        vec_st( C2, 1 * sizeof(vec_float4), (vec_float4*)this );
        vec_st( C3, 2 * sizeof(vec_float4), (vec_float4*)this );
        vec_st( C4, 3 * sizeof(vec_float4), (vec_float4*)this );
    }

    // Multiply the input matrices using VMX and return results on the stack

    friend inline Mat44 MulVMX(const Mat44& A, const Mat44& B) __attribute__ ((always_inline))
    {
        //Load the matrix rows
        const vec_float4 A1 = vec_ld( 0 * sizeof(vec_float4), (vec_float4*)&A );
        const vec_float4 A2 = vec_ld( 1 * sizeof(vec_float4), (vec_float4*)&A );
        const vec_float4 A3 = vec_ld( 2 * sizeof(vec_float4), (vec_float4*)&A );
        const vec_float4 A4 = vec_ld( 3 * sizeof(vec_float4), (vec_float4*)&A );

        const vec_float4 B1 = vec_ld( 0 * sizeof(vec_float4), (vec_float4*)&B );
        const vec_float4 B2 = vec_ld( 1 * sizeof(vec_float4), (vec_float4*)&B );
        const vec_float4 B3 = vec_ld( 2 * sizeof(vec_float4), (vec_float4*)&B );
        const vec_float4 B4 = vec_ld( 3 * sizeof(vec_float4), (vec_float4*)&B );

        const vec_float4 Zero = (vec_float4)vec_splat_u32(0);

        vec_float4 C1, C2, C3, C4;

        //Do the first scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 0 ), B1, Zero );
        C2 = vec_madd( vec_splat( A2, 0 ), B1, Zero );
        C3 = vec_madd( vec_splat( A3, 0 ), B1, Zero );
        C4 = vec_madd( vec_splat( A4, 0 ), B1, Zero );

        //Accumulate in the second scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 1 ), B2, C1 );
        C2 = vec_madd( vec_splat( A2, 1 ), B2, C2 );
        C3 = vec_madd( vec_splat( A3, 1 ), B2, C3 );
        C4 = vec_madd( vec_splat( A4, 1 ), B2, C4 );

        //Accumulate in the third scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 2 ), B3, C1 );
        C2 = vec_madd( vec_splat( A2, 2 ), B3, C2 );
        C3 = vec_madd( vec_splat( A3, 2 ), B3, C3 );
        C4 = vec_madd( vec_splat( A4, 2 ), B3, C4 );

        //Accumulate in the fourth scalar x vector multiply for each row
        C1 = vec_madd( vec_splat( A1, 3 ), B4, C1 );
        C2 = vec_madd( vec_splat( A2, 3 ), B4, C2 );
        C3 = vec_madd( vec_splat( A3, 3 ), B4, C3 );
        C4 = vec_madd( vec_splat( A4, 3 ), B4, C4 );

        //Store out the result
        Mat44 Result;
        vec_st( C1, 0 * sizeof(vec_float4), (vec_float4*)&Result );
        vec_st( C2, 1 * sizeof(vec_float4), (vec_float4*)&Result );
        vec_st( C3, 2 * sizeof(vec_float4), (vec_float4*)&Result );
        vec_st( C4, 3 * sizeof(vec_float4), (vec_float4*)&Result );
        return Result;
    }

    Vec4    R[4];

} __attribute__ ((aligned(16)));

// Prints every matrix in an ARRAY_SIZE-element array. Called after each timed
// loop; this will be used mainly to force the optimizer not to throw out all
// our work.
void PrintResults( const Mat44* pResults );

// These just give us convenient labels to search for in the assembly output
// (main.s). They are called immediately before and after each timed loop.
void LoopStart( void );
void LoopEnd( void );

// Returns a pseudo-random float in [-1.0, 1.0], derived from one rand() call:
// rand() == 0 maps to 1.0 and rand() == RAND_MAX maps to -1.0.
static inline float RandFloat(void)
{
    const float Sample = (float)rand();
    return 1.0f - (2.0f * Sample) / RAND_MAX;
}

// Benchmark driver.
//
// Fills three arrays of random matrices, runs an untimed warm-up, then times
// eight multiply variants (scalar / VMX, write-to-"this" / return-by-value,
// two / three operands). Each timed loop is bracketed by LoopStart()/LoopEnd()
// so it can be located in the assembly output, and followed by PrintResults()
// so the results are observably used. Finishes with a per-variant report
// scaled by 1/ITERATIONS. Always returns 0.
int main(void)
{
    Mat44 A[ARRAY_SIZE], B[ARRAY_SIZE], C[ARRAY_SIZE], Result[ARRAY_SIZE];

    int i;

    // Populate the operand arrays with random matrices
    for (i=0; i<ARRAY_SIZE; i++)
    {
        A[i] = Mat44(   Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()) );

        B[i] = Mat44(   Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()) );

        C[i] = Mat44(   Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()),
                        Vec4(RandFloat(), RandFloat(), RandFloat(), RandFloat()) );
    }

    // Stall a little while, just to be safe before doing any timing
    for (i=0; i<ITERATIONS*20; i++)
    {
        Result[i&(ARRAY_SIZE-1)].Mul(A[(i+1)&(ARRAY_SIZE-1)], B[(ITERATIONS-i)&(ARRAY_SIZE-1)]);
    }

    // sizeof yields a size_t; convert explicitly so the argument type matches
    // the conversion specifier.
    printf("sizeof(Vec4) = %lu\n", (unsigned long)sizeof(Vec4));
    printf("sizeof(Mat44) = %lu\n", (unsigned long)sizeof(Mat44));

    printf("********************************************************************************\n");

    printf("\n* Testing normal Mul *\n");

    CLOCK_START(CLOCK_MUL);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);    // wrap the counter into the arrays
        Result[Slot].Mul(A[Slot], B[Slot]);
    }
    LoopEnd();
    CLOCK_END(CLOCK_MUL);
    PrintResults(Result);

    printf("\n* Testing normal Mul stack version *\n");

    CLOCK_START(CLOCK_MULSTACK);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot] = Mul(A[Slot], B[Slot]);
    }
    LoopEnd();
    CLOCK_END(CLOCK_MULSTACK);
    PrintResults(Result);

    printf("\n* Testing normal Mul 3 operands *\n");

    CLOCK_START(CLOCK_MUL3);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot].Mul(A[Slot], B[Slot]);
        // Result[Slot] is both the destination and the left operand here, so
        // the output is only A*B*C if Mat44::Mul copes with that aliasing.
        Result[Slot].Mul(Result[Slot], C[Slot]);
    }
    LoopEnd();
    CLOCK_END(CLOCK_MUL3);
    PrintResults(Result);

    printf("\n* Testing normal Mul 3 operands stack version *\n");

    CLOCK_START(CLOCK_MUL3STACK);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot] = Mul( Mul(A[Slot], B[Slot]), C[Slot] );
    }
    LoopEnd();
    CLOCK_END(CLOCK_MUL3STACK);
    PrintResults(Result);

    printf("\n* Testing VMX Mul *\n");

    CLOCK_START(CLOCK_MULVMX);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot].MulVMX(A[Slot], B[Slot]);
    }
    LoopEnd();
    CLOCK_END(CLOCK_MULVMX);
    PrintResults(Result);

    printf("\n* Testing VMX Mul stack version *\n");

    CLOCK_START(CLOCK_MULVMXSTACK);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot] = MulVMX(A[Slot], B[Slot]);
    }
    LoopEnd();
    CLOCK_END(CLOCK_MULVMXSTACK);
    PrintResults(Result);

    printf("\n* Testing VMX Mul 3 operands *\n");

    CLOCK_START(CLOCK_MUL3VMX);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot].MulVMX(A[Slot], B[Slot]);
        // Destination aliases the left operand, as in the scalar 3-operand test.
        Result[Slot].MulVMX(Result[Slot], C[Slot]);
    }
    LoopEnd();
    CLOCK_END(CLOCK_MUL3VMX);
    PrintResults(Result);

    printf("\n* Testing VMX Mul 3 operands stack version *\n");

    CLOCK_START(CLOCK_MUL3VMXSTACK);
    LoopStart();
    for (i=0; i<ITERATIONS; i++)
    {
        const int Slot = i & (ARRAY_SIZE-1);
        Result[Slot] = MulVMX( MulVMX(A[Slot], B[Slot]), C[Slot] );
    }
    LoopEnd();
    CLOCK_END(CLOCK_MUL3VMXSTACK);
    PrintResults(Result);

    printf("********************************************************************************\n");

    // Report each timer scaled by 1/ITERATIONS, i.e. per loop iteration
    printf("Report:\n");
    CLOCK_REPORT_AND_RESET( CLOCK_MUL,          1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MULSTACK,     1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MUL3,         1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MUL3STACK,    1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MULVMX,       1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MULVMXSTACK,  1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MUL3VMX,      1.0f/ITERATIONS );
    CLOCK_REPORT_AND_RESET( CLOCK_MUL3VMXSTACK, 1.0f/ITERATIONS );

    printf("********************************************************************************\n");


    return 0;
}

void PrintResults( const Mat44* pResults )
{
    int i;
    for (i=0; i<ARRAY_SIZE; i++)
    {
        printf("\n");
        printf("%7.4f %7.4f %7.4f %7.4f\n", pResults[i].R[0].X[0], pResults[i].R[0].X[1], pResults[i].R[0].X[2], pResults[i].R[0].X[3] );
        printf("%7.4f %7.4f %7.4f %7.4f\n", pResults[i].R[1].X[0], pResults[i].R[1].X[1], pResults[i].R[1].X[2], pResults[i].R[1].X[3] );
        printf("%7.4f %7.4f %7.4f %7.4f\n", pResults[i].R[2].X[0], pResults[i].R[2].X[1], pResults[i].R[2].X[2], pResults[i].R[2].X[3] );
        printf("%7.4f %7.4f %7.4f %7.4f\n", pResults[i].R[3].X[0], pResults[i].R[3].X[1], pResults[i].R[3].X[2], pResults[i].R[3].X[3] );
    }
}

// These functions aren't supposed to do anything useful

// Marker called just before each timed loop so the loop can be found in the
// assembly output. Its only effect is one rand() call whose value is dropped.
void LoopStart( void )
{
    (void)rand();
}

// Marker called just after each timed loop so the loop can be found in the
// assembly output. Its only effect is one rand() call whose value is dropped.
void LoopEnd( void )
{
    (void)rand();
}
