/*
 *  This is just an SPU exercise. It draws some Flora, but not necessarily in a
 * way that is compatible with the current Flora system in the game.
 *  
 *  Author: Jay Ryness
 *
 *  Copyright 2006 Sony Online Entertainment.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/spu_initialize.h>
#include <sys/spu_thread.h>
#include <sys/spu_thread_group.h>
#include <sys/spu_image.h>
#include <sys/ppu_thread.h>
#include <sys/sys_time.h>
#include <spu_printf.h>

#include <sys/event.h>
#include <sys/process.h>
#include <sys/paths.h>

#include <PSGL/psgl.h>
#include <PSGL/psglu.h>

#include <cell/pad.h>

#include <libsntuner.h>

#include <vectormath/cpp/vectormath_aos.h>
using namespace Vectormath::Aos;

#include <FloraSPU\SyFloraSPU.h>

#include "tutorialDDSloader.h"

// Pi.  Not referenced by the code visible in this file (ControlCamera spells
// the constant out as a float literal instead).
#define PI 3.14159265358979

// FLORA_COUNT is the number of FloraSPUInst entries in gFloraInstances.
// FLORA_OUTPUT_VERTS is the number of FloraSPUVertex entries allocated for
// each of the two output VBOs, and is also passed to the SPU as the output
// buffer size.
#define FLORA_COUNT             (96*1024)
#define FLORA_OUTPUT_VERTS      (1024*1024-1)

// Noise coordinates are multiplied by TERRAIN_SIZE to get world x/z, and the
// Noise2D result is multiplied by TERRAIN_HEIGHT to get world y.
#define TERRAIN_SIZE            200.f
#define TERRAIN_HEIGHT          40.f

// Number of octaves summed by Noise2D; also sets the finite-difference step
// used by Noise2DSlopeX / Noise2DSlopeY.
#define NOISE_OCTAVES           5

// Clip distances passed to gluPerspectivef, to the SPU environment and to the
// fog shader parameters.
#define NEAR_CLIP               0.1f
#define FAR_CLIP                200.0f

// These are for the event queue that the PPU uses to send events to the SPU
#define SPU_BOUND_EVENT_QUEUE_KEY 0x92311293UL
#define SPU_BOUND_EVENT_QUEUE_SIZE 8
#define SPU_BOUND_EVENT_QUEUE_PORT 0x10

// For communication from SPU to PPU
// (the port number has the same value as SPU_BOUND_EVENT_QUEUE_PORT, but it is
// used for the opposite direction)
#define PPU_BOUND_EVENT_QUEUE_KEY 0x92311294UL
#define PPU_BOUND_EVENT_QUEUE_SIZE 8
#define PPU_BOUND_EVENT_QUEUE_PORT 0x10

// These are for the SPU printf queue, plus the priority and stack size of the
// PPU thread that services it
#define SPU_PRINTF_EVENT_QUEUE_KEY 0x03138561UL
#define SPU_PRINTF_EVENT_QUEUE_SIZE 8
#define SPU_PRINTF_EVENT_QUEUE_PORT 0x1
#define SPU_PRINTF_THREAD_PRIO 1001
#define SPU_PRINTF_THREAD_STACK_SIZE (64 * 1024)

// These handy little macros save me from writing a bunch of nearly identical error printf code.
//   SAFE_CALL( f, args... )       calls f(args...); if the result is not CELL_OK it prints the
//                                 name of f to stderr and exits the process.
//   SAFE_CALL_GL( r, f, args... ) assigns f(args...) to r; if r compares equal to NULL it prints
//                                 the name of f to stderr and exits the process.
// Both bodies are wrapped in do { ... } while (0) so that "SAFE_CALL(...);" is exactly one
// statement and remains correct as the unbraced body of an if that is followed by an else.
#define SAFE_CALL( f, ... )       do { if ( f(__VA_ARGS__) != CELL_OK )      { fprintf(stderr, "\n--> Error returned by '"#f"' <--\n\n"); exit(-1); } } while (0)
#define SAFE_CALL_GL( r, f, ... ) do { r = f(__VA_ARGS__); if ( r == NULL )  { fprintf(stderr, "\n--> Error returned by '"#f"' <--\n\n"); exit(-1); } } while (0)

// Placeholder description of one kind of flora.  Nothing in this file
// instantiates it yet.
class FloraSet
{
private:
  // TODO: texture and material information still need to live here
  float BaseScale;
  // float ScaleVariance;   (not enabled yet)
};

// Threads, events, and such
// (all of these are filled in by StartSPUs)
static sys_spu_thread_group_t   gSpuGroup;            // SPU thread group ID
static sys_spu_thread_t         gSpuThread;           // SPU thread ID
static sys_spu_image_t          gSpuELF;              // SPU executable

static sys_event_queue_t        gSpuPrintfEventQueue; // SPU uses this queue to send spu_printf events back to PPU
static sys_ppu_thread_t         gSpuPrintfThread;     // This is a PPU thread that listens for spu_printf events

static sys_event_queue_t        gSpuBoundEventQueue;  // Event queue to talk to SPU
static sys_event_port_t         gSpuBoundEventPort;   // Port used from PPU to write to SPU
static sys_event_queue_t        gPpuBoundEventQueue;  // Event queue to hear from SPU

// Screen resolution
// (read back from the PSGL device in InitializeGL)
static int  gGLWidth;
static int  gGLHeight;

// Shaders, shader parameters, and such
// (created and looked up by name in LoadShaders; gVParm* belong to the vertex
// program, gFParm* to the fragment program)
static CGcontext    gCGcontext;
static CGprogram    gVertexShader;
static CGprogram    gFragmentShader;
static CGparameter  gVParmWorldViewProjMat;   // "WorldViewProjMat"
static CGparameter  gVParmLightAmbient;       // "LightAmbient"
static CGparameter  gVParmSubTexScale;        // "SubTexScale"
static CGparameter  gVParmFogFar;             // "FogFar"
static CGparameter  gVParmFogRange;           // "FogRange"
static CGparameter  gFParmColorTex;           // "ColorTex"
static CGparameter  gFParmFogColor;           // "FogColor"
static CGparameter  gFParmFogOn;              // "FogOn"

// The huge flora buffer that will be sent to SPU
// (filled by PopulateFloraInstances; main hands the SPU addresses into this
// array, and the declaration requests 128-byte alignment)
static FloraSPUInst  gFloraInstances[FLORA_COUNT]  __attribute__ ((aligned(128)));

/*
 * This is the main function of the spu_printf service thread.
 * It listens for messages and calls the printf handler when it gets them,
 * then notifies the SPU of completion by using a mailbox.
 */

static void spuPrintfThreadMain(uint64_t arg)
{
  sys_event_t event;

  while (1)
  {
    SAFE_CALL( sys_event_queue_receive, gSpuPrintfEventQueue, &event, SYS_NO_TIMEOUT );
    sys_spu_thread_write_spu_mb( event.data1, spu_thread_printf(event.data1, event.data3) );
  }
}

/*
 * Initialize the SPU threads and event queues, and start up the SPUs
 */

// Creates a one-thread SPU thread group, loads floraSPU.elf into that thread,
// wires up three event queues (PPU->SPU, SPU->PPU and spu_printf) and finally
// starts the group.  Every system call goes through SAFE_CALL, so any failure
// prints the failing function's name and exits the process.
void StartSPUs(void)
{
  // These are used to pass various attributes to sys calls
  sys_event_queue_attribute_t       queueAttr;
  sys_spu_thread_group_attribute_t  groupAttr;
  sys_spu_thread_attribute_t        threadAttr;
  sys_spu_thread_argument_t         threadArgs;

  // Create and configure the thread group.
  // nsize is the name length including the terminating NUL.
  groupAttr.name   = "Flora processing group";
  groupAttr.nsize  = strlen(groupAttr.name) + 1;
  groupAttr.type   = SYS_SPU_THREAD_GROUP_TYPE_NORMAL;

  // 1 SPU thread, priority is 100
  SAFE_CALL( sys_spu_thread_group_create, &gSpuGroup, 1, 100, &groupAttr );

  // Load the thread code
  SAFE_CALL( sys_spu_image_open, &gSpuELF, SYS_APP_HOME"/floraSPU.elf" );

  // Create and initialize the thread
  // The two event port numbers are handed to the SPU program as its first two
  // thread arguments.
  threadAttr.name    = "Flora processing thread";
  threadAttr.nsize   = strlen(threadAttr.name)+1;
  threadAttr.option  = SYS_SPU_THREAD_OPTION_NONE;
  threadArgs.arg1    = SYS_SPU_THREAD_ARGUMENT_LET_32( SPU_BOUND_EVENT_QUEUE_PORT );
  threadArgs.arg2    = SYS_SPU_THREAD_ARGUMENT_LET_32( PPU_BOUND_EVENT_QUEUE_PORT );
  SAFE_CALL( sys_spu_thread_initialize, &gSpuThread, gSpuGroup, 0, &gSpuELF, &threadAttr, &threadArgs );

  // Now create and connect the SPU-bound event queue
  // The queue is created as an SPU queue, a local PPU-side port is connected
  // to it for sending, and the queue is bound to the SPU thread.
  sys_event_queue_attribute_initialize(queueAttr);
  queueAttr.attr_protocol  = SYS_SYNC_PRIORITY;
  queueAttr.type           = SYS_SPU_QUEUE;
  SAFE_CALL( sys_event_queue_create,        &gSpuBoundEventQueue, &queueAttr, SPU_BOUND_EVENT_QUEUE_KEY, SPU_BOUND_EVENT_QUEUE_SIZE );
  SAFE_CALL( sys_event_port_create,         &gSpuBoundEventPort, SYS_EVENT_PORT_LOCAL, SPU_BOUND_EVENT_QUEUE_PORT );
  SAFE_CALL( sys_event_port_connect_local,  gSpuBoundEventPort, gSpuBoundEventQueue );
  SAFE_CALL( sys_spu_thread_bind_queue,     gSpuThread, gSpuBoundEventQueue, SPU_BOUND_EVENT_QUEUE_PORT );

  // PPU-bound event queue
  // Uses whatever attributes sys_event_queue_attribute_initialize sets, and is
  // connected to the SPU thread as a user event on PPU_BOUND_EVENT_QUEUE_PORT.
  sys_event_queue_attribute_initialize(queueAttr);
  SAFE_CALL( sys_event_queue_create,        &gPpuBoundEventQueue, &queueAttr, PPU_BOUND_EVENT_QUEUE_KEY, PPU_BOUND_EVENT_QUEUE_SIZE );
  SAFE_CALL( sys_spu_thread_connect_event,  gSpuThread, gPpuBoundEventQueue, SYS_SPU_THREAD_EVENT_USER, PPU_BOUND_EVENT_QUEUE_PORT );

  // This is the event queue and handler thread for spu_printf
  // The handler thread runs spuPrintfThreadMain and is created joinable.
  sys_event_queue_attribute_initialize(queueAttr);
  SAFE_CALL( sys_event_queue_create,        &gSpuPrintfEventQueue, &queueAttr, SPU_PRINTF_EVENT_QUEUE_KEY, SPU_PRINTF_EVENT_QUEUE_SIZE );
  SAFE_CALL( sys_ppu_thread_create,         &gSpuPrintfThread, spuPrintfThreadMain, (uint64_t) 0, SPU_PRINTF_THREAD_PRIO, SPU_PRINTF_THREAD_STACK_SIZE, SYS_PPU_THREAD_CREATE_JOINABLE, "spu_printf_handler" );
  SAFE_CALL( sys_spu_thread_connect_event,  gSpuThread, gSpuPrintfEventQueue, SYS_SPU_THREAD_EVENT_USER, SPU_PRINTF_EVENT_QUEUE_PORT );

  // Fire up the SPU (finally!)
  SAFE_CALL( sys_spu_thread_group_start, gSpuGroup );
}

// Initializes PSGL, creates the device and a context, stores the configured
// output resolution in gGLWidth / gGLHeight, sets the initial GL state and
// presents one cleared frame.  Device, buffer-parameter and context creation
// go through SAFE_CALL_GL, which exits the process on a NULL result; the
// results of the other PSGL/GL calls are not checked.
void InitializeGL(void)
{
  // First, initialize PSGL
  // Note that since we initialized the SPUs ourselves earlier we should
  // make sure that PSGL doesn't try to do so as well.
  PSGLinitOptions initOpts =
  {
    // Only the three options named in 'enable' are given meaningful values below.
    enable                : PSGL_INIT_MAX_SPUS | PSGL_INIT_INITIALIZE_SPUS | PSGL_INIT_HOST_MEMORY_SIZE,
    maxSPUs               : 1,
    initializeSPUs        : false,

    // We're not specifying values for these options, the code is only
    // here to alleviate compiler warnings.
    persistentMemorySize  : 0,
    transientMemorySize   : 0,
    errorConsole          : 0,
    fifoSize              : 0,

    // Put aside 64 megabytes for VBOs.
    hostMemorySize        : 64 * 1024 * 1024,
  };

  psglInit(&initOpts);

  // Next, create the device
  // Note, we'll query the actual width/height below.
  PSGLbufferParameters glBufferParams =
  {
    width         : 0,
    height        : 0,
    colorBits     : 24,
    alphaBits     : 8,
    depthBits     : 24,
    stencilBits   : 8,
    deviceType    : PSGL_DEVICE_TYPE_AUTO,
    TVStandard    : PSGL_TV_STANDARD_NONE,
    TVFormat      : PSGL_TV_FORMAT_AUTO,
    bufferingMode : PSGL_BUFFERING_MODE_DOUBLE,
    antiAliasing  : GL_TRUE,
  };

  PSGLdevice *pGLDevice;
  SAFE_CALL_GL( pGLDevice, psglCreateDevice, &glBufferParams );

  // Query the resolution that the system has been configured to output
  const PSGLbufferParameters *pConfiguredBufferParams;
  SAFE_CALL_GL( pConfiguredBufferParams, psglGetDeviceBufferParameters, pGLDevice );

  gGLWidth  = pConfiguredBufferParams->width;
  gGLHeight = pConfiguredBufferParams->height;

  printf("Video mode configured as %ix%i\n", gGLWidth, gGLHeight);

  // Now create a PSGL context
  PSGLcontext *pContext;
  SAFE_CALL_GL( pContext, psglCreateContext );

  // Make this context current for the device we initialized
  psglMakeCurrent(pContext, pGLDevice);

  // Reset the context
  psglResetCurrentContext();

  // Full-screen viewport and scissor, opaque black clear color, and a
  // less-or-equal depth test.
  glViewport(0, 0, gGLWidth, gGLHeight);
  glScissor(0, 0, gGLWidth, gGLHeight);
  glClearColor(0.f, 0.f, 0.f, 1.f);
  glEnable(GL_DEPTH_TEST);
  glDepthFunc(GL_LEQUAL);

  // Disable VSYNC just for benchmarking reasons -- we don't want to have
  // 60fps be the lower bound for this sample -- just to measure how fast
  // it can really get.
  glDisable(GL_VSYNC_SCE);

  // PSGL doesn't clear the screen on startup, so let's do that here.
  glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
  psglSwap();
}

// Creates the Cg context, enables the latest vertex and fragment profiles,
// loads the two precompiled flora programs from disk and looks up every
// shader parameter the render loop sets.  Program creation and each parameter
// lookup go through SAFE_CALL_GL, so a missing file or a missing parameter
// name prints the failing function and exits the process.
void LoadShaders(void)
{
  gCGcontext = cgCreateContext();

  CGprofile vertexProfile   = cgGLGetLatestProfile(CG_GL_VERTEX);
  CGprofile fragmentProfile = cgGLGetLatestProfile(CG_GL_FRAGMENT);

  cgGLEnableProfile(vertexProfile);
  cgGLEnableProfile(fragmentProfile);

  /*
  * Load the vertex program and find its parameters.
  */
  SAFE_CALL_GL( gVertexShader, cgCreateProgramFromFile, gCGcontext, CG_BINARY, SYS_APP_HOME"/../application/test/floratest/floratest/flora_vs.vpo", vertexProfile, NULL, NULL );
  cgGLLoadProgram(gVertexShader);

  SAFE_CALL_GL( gVParmWorldViewProjMat, cgGetNamedParameter, gVertexShader, "WorldViewProjMat" );
  SAFE_CALL_GL( gVParmLightAmbient,     cgGetNamedParameter, gVertexShader, "LightAmbient" );
  SAFE_CALL_GL( gVParmSubTexScale,      cgGetNamedParameter, gVertexShader, "SubTexScale" );
  SAFE_CALL_GL( gVParmFogFar,           cgGetNamedParameter, gVertexShader, "FogFar" );
  SAFE_CALL_GL( gVParmFogRange,         cgGetNamedParameter, gVertexShader, "FogRange" );

  /*
  * Load the fragment program and find its parameters.
  */
  SAFE_CALL_GL( gFragmentShader, cgCreateProgramFromFile, gCGcontext, CG_BINARY, SYS_APP_HOME"/../application/test/floratest/floratest/flora_ps.fpo", fragmentProfile, NULL, NULL );
  // Load it explicitly as well, mirroring the vertex program, so that both
  // programs have been loaded before the render loop binds them.
  cgGLLoadProgram(gFragmentShader);

  SAFE_CALL_GL( gFParmColorTex, cgGetNamedParameter, gFragmentShader, "ColorTex" );
  SAFE_CALL_GL( gFParmFogColor, cgGetNamedParameter, gFragmentShader, "FogColor" );
  SAFE_CALL_GL( gFParmFogOn,    cgGetNamedParameter, gFragmentShader, "FogOn" );
}

// Advances a 32-bit linear congruential generator by one step and returns the
// new state; arithmetic wraps modulo 2^32.
inline uint32_t Random(uint32_t seed)
{
  // Multiplier and increment of the well-known BCPL generator, notable for
  // good spectral behavior.
  const uint32_t scaled = seed * 2147001325;
  return scaled + 715136305;
}

// Bilinear interpolation of four corner samples.  u blends from the s?0
// samples to the s?1 samples, v blends from the s0? samples to the s1?
// samples; (u,v) = (0,0) yields s00 and (1,1) yields s11.
inline float BiLerp(float u, float v, float s00, float s01, float s10, float s11)
{
  const float invU = 1.f-u;
  const float invV = 1.f-v;

  return invU*invV*s00 + u*invV*s01 + invU*v*s10 + u*v*s11;
}

// This is used to make the flora look like it follows hilly terrain
// Sums NOISE_OCTAVES layers of bilinearly interpolated random values; each
// layer has twice the grid resolution and half the weight of the previous one.
// x and y are expected in [-0.5..+0.5] and are clamped after remapping.  The
// random tables are generated on the first call, seeded from the system time.
float Noise2D(float x, float y)
{
  static const int   Octaves = NOISE_OCTAVES; // number of "mips", essentially
  static const float Decay   = 0.5f;          // weight ratio between one octave and the next

  static bool  bTablesReady = false;
  static float Samples[(1 << (2*(Octaves+1)))*3/2];

  // Lazily fill the noise "textures" with values in [0..1)
  if (!bTablesReady)
  {
    bTablesReady = true;

    uint32_t seed = sys_time_get_system_time();
    for (unsigned s = 0; s < sizeof(Samples)/sizeof(float); s++)
    {
      seed = Random(seed);
      Samples[s] = (seed >> 16) / 65536.f;
    }
  }

  // Shift [-0.5..+0.5] to [0..1] and clamp so the integer cell index below
  // always stays inside the grid
  x = fmax(0.f, fmin(255.0f/256.0f, x+0.5f));
  y = fmax(0.f, fmin(255.0f/256.0f, y+0.5f));

  float total     = 0.f;
  float amplitude = 1.f;
  int   gridSize  = 2;   // cells per side for the current octave (always a power of two)
  int   gridBase  = 0;   // offset of the current octave's grid inside Samples

  for (int octave = 0; octave < Octaves; octave++)
  {
    const float gx = x * (float)gridSize;
    const float gy = y * (float)gridSize;

    // Cell containing the point, and its neighbour with wrap-around
    const uint32_t col0 = (uint32_t)gx;
    const uint32_t row0 = (uint32_t)gy;
    const uint32_t col1 = (col0 + 1)&(gridSize-1);
    const uint32_t row1 = (row0 + 1)&(gridSize-1);

    const float s00 = Samples[gridBase + row0*gridSize + col0];
    const float s01 = Samples[gridBase + row0*gridSize + col1];
    const float s10 = Samples[gridBase + row1*gridSize + col0];
    const float s11 = Samples[gridBase + row1*gridSize + col1];

    total += amplitude * BiLerp(gx-(float)col0, gy-(float)row0, s00, s01, s10, s11);

    // Step to the next, finer octave
    gridBase  += gridSize * gridSize;
    gridSize  *= 2;
    amplitude *= Decay;
  }

  return total;
}

// Central-difference estimate of the rate of change of Noise2D along x at (x, y).
float Noise2DSlopeX(float x, float y)
{
  const float step   = 1.f / (float)(1 << (2+NOISE_OCTAVES));
  const float ahead  = Noise2D(x+step, y);
  const float behind = Noise2D(x-step, y);

  return (ahead - behind) / (2.f*step);
}

// Central-difference estimate of the rate of change of Noise2D along y at (x, y).
float Noise2DSlopeY(float x, float y)
{
  const float step   = 1.f / (float)(1 << (2+NOISE_OCTAVES));
  const float ahead  = Noise2D(x, y+step);
  const float behind = Noise2D(x, y-step);

  return (ahead - behind) / (2.f*step);
}

void PopulateFloraInstances(void)
{
  int i;

  srand(sys_time_get_system_time());

  for (i = 0; i < FLORA_COUNT; i++)
  {
    float x = 0.5f - (float)rand()/(float)RAND_MAX;
    float z = 0.5f - (float)rand()/(float)RAND_MAX;
    float y = Noise2D(x, z);

    gFloraInstances[i].Position = (vec_float4){ TERRAIN_SIZE * x, TERRAIN_HEIGHT * y, TERRAIN_SIZE * z, 0.f };

    float nx = TERRAIN_HEIGHT * -Noise2DSlopeX(x, z);
    float nz = TERRAIN_HEIGHT * -Noise2DSlopeY(x, z);
    float ny = TERRAIN_HEIGHT * 4.f;
    float nn = sqrt(nx*nx + ny*ny + nz*nz);

    gFloraInstances[i].Normal = (vec_float4){ nx/nn, ny/nn, nz/nn, 0.f };
  }
}

int WaitForFloraSPUFinish(void)
{
  sys_event_t event;

  SAFE_CALL( sys_event_queue_receive, gPpuBoundEventQueue, &event, SYS_NO_TIMEOUT );

  return event.data3;
}

// This just filters the control stick input to account for dead zone, and outputs float in range [-1..1]
float PrepStickInput(int button, float deadZone)
{
  float stick = ((float)button - 128.f) / 128.f;

  stick = fabs(stick) < deadZone ? 0.f : (stick - (stick/fabs(stick)) * deadZone) / (1.f-deadZone);

  return stick;
}

// Converts a button value to a float: 1 for any non-zero value, 0 otherwise.
float PrepButtonInput(int button)
{
  return button ? 1.f : 0.f;
}

// Translate control pad input to camera movement
int ControlCamera(const CellPadData &padData, Point3 &cameraPos, Vector3 &cameraDir)
{
  float leftStickX  = PrepStickInput(padData.button[6], 0.5f);
  float leftStickY  = PrepStickInput(padData.button[7], 0.5f);
  float rightStickX = PrepStickInput(padData.button[4], 0.5f);
  float rightStickY = PrepStickInput(padData.button[5], 0.5f);
  float L1          = PrepButtonInput((padData.button[3]>>2)&1);
  float L2          = PrepButtonInput((padData.button[3]>>0)&1);

  Vector3 strafeXDir = normalize(cross(cameraDir, Vector3(0.f, 1.f, 0.f)));
  Vector3 strafeYDir = normalize(cross(strafeXDir, cameraDir));

  cameraDir = (Matrix4(Transform3::rotation(-rightStickY*3.14159265359f/120.f, strafeXDir)) * cameraDir).getXYZ();
  cameraDir = (Matrix4(Transform3::rotationY(-rightStickX*3.14159265359f/120.f)) * cameraDir).getXYZ();
  cameraDir = normalize(cameraDir);

  cameraPos += 0.5f * (-leftStickY * cameraDir + leftStickX * strafeXDir + (L1-L2) * strafeYDir);

  // Camera altitude restriction
  float terrainHeight = TERRAIN_HEIGHT * Noise2D(cameraPos.getX()/TERRAIN_SIZE, cameraPos.getZ()/TERRAIN_SIZE);
  cameraPos.setY( fmax(cameraPos.getY(), terrainHeight) );

  return 1;
}

// Program entry point.  Brings up the SPU thread, PSGL, the shaders and the
// flora instance data, then loops forever: each frame it hands the SPU a set
// of flora commands, updates the camera from the pad while the SPU works,
// waits for the SPU to finish and draws the generated geometry in two passes
// (depth only, then color).
int main(void)
{
  printf("Hello from PPU\n");

  snTunerInit();

  // Initialize 6 SPUs but reserve 1 SPU as a raw SPU for PSGL
  sys_spu_initialize(6, 1);

  StartSPUs();

  InitializeGL();
  LoadShaders();

  PopulateFloraInstances();

  SAFE_CALL( cellPadInit, 1 );

  // Initialize the VBOs.  This is where the results will be stored.
  // Using GL_SYSTEM_DRAW_SCE ensures that the VBO resides in main memory
  // instead of RSX local memory.  This is good (in this case) because
  // the RSX can pull from main memory much faster than the Cell can push
  // to RSX.  Double buffering is used to bypass synchronization
  // difficulties between SPU (data producer) and GSX (data consumer).
  GLuint          VBOs[2];
  FloraSPUVertex  *pFloraGeometry[2];

  glGenBuffers(2, VBOs);

  // Both buffers are mapped once here and the mapped addresses are handed to
  // the SPU every frame; they are never unmapped.
  glBindBuffer(GL_ARRAY_BUFFER, VBOs[0]);
  glBufferData(GL_ARRAY_BUFFER, FLORA_OUTPUT_VERTS*sizeof(FloraSPUVertex), 0, GL_SYSTEM_DRAW_SCE);
  pFloraGeometry[0] = (FloraSPUVertex*)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);

  glBindBuffer(GL_ARRAY_BUFFER, VBOs[1]);
  glBufferData(GL_ARRAY_BUFFER, FLORA_OUTPUT_VERTS*sizeof(FloraSPUVertex), 0, GL_SYSTEM_DRAW_SCE);
  pFloraGeometry[1] = (FloraSPUVertex*)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);

  // The byte count is a size_t, so it is cast to match the %x conversion.
  printf("VBO buffers at %x and %x -- should be %x apart\n", (unsigned)pFloraGeometry[0], (unsigned)pFloraGeometry[1], (unsigned)(FLORA_OUTPUT_VERTS*sizeof(FloraSPUVertex)));

  GLuint texture;
  SAFE_CALL_GL( texture, tutorialLoadDDSTexture, SYS_APP_HOME"/../application/test/floratest/floratest/grass.dds" );

  // Set up the matrices
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  gluPerspectivef(90.0f, (float)gGLWidth/(float)gGLHeight, NEAR_CLIP, FAR_CLIP);

  // Set the alpha test to fail pixels that have small alpha
  glAlphaFunc(GL_GEQUAL, 192.0f/255.0f);
  glEnable(GL_ALPHA_TEST);

  // Alpha blending
  glEnable(GL_BLEND);
  glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

  // Camera
  Point3  cameraPos(0.f, 15.f, -50.f);
  Vector3 cameraDir = normalize(Vector3(0.f, 5.f, 65.f));

  int frameCounter = 0;

  // frameCounter starts at 1 on the first frame; its low bit selects which of
  // the two VBOs is written and drawn this frame.
  while (++frameCounter)
  {
    //printf(" ** Frame #%d **\n", frameCounter);

    // The SPU is given the address of one entry per command to report into.
    FloraSPUResult  spuResults[16];
    // Get the SPU started processing the data
    // Three commands, one per card geometry type, each covering a different
    // slice of gFloraInstances.
    FloraSPUCommand spuCommands[] =
    {
      {
        Tag:                0,
        InputBufferAddr:    (uint32_t)&gFloraInstances[0],
        InputBufferSize:    FLORA_COUNT/2,
        ResultAddr:         (uint32_t)&spuResults[0],
        GeomType:           FloraGeomType_1Card,
        SubTextureU:        0.f,
        SubTextureV:        0.f,
        ScaleBase:          0.75f,
        ScaleRand:          0.25f,
        Wind:               0.125f,
        RandSeed:           0xdeadbeef,
      },
      {
        Tag:                0,
        InputBufferAddr:    (uint32_t)&gFloraInstances[FLORA_COUNT/2],
        InputBufferSize:    FLORA_COUNT/4,
        ResultAddr:         (uint32_t)&spuResults[1],
        GeomType:           FloraGeomType_2Card,
        SubTextureU:        0.f,
        SubTextureV:        0.f,
        ScaleBase:          0.5f,
        ScaleRand:          0.15f,
        Wind:               0.125f,
        RandSeed:           0xcafebabe,
      },
      {
        // NOTE(review): this slice is FLORA_COUNT/6 long although FLORA_COUNT/4
        // instances remain after the first two slices -- confirm that leaving
        // the tail unused is intended.
        Tag:                0,
        InputBufferAddr:    (uint32_t)&gFloraInstances[FLORA_COUNT/2 + FLORA_COUNT/4],
        InputBufferSize:    FLORA_COUNT/6,
        ResultAddr:         (uint32_t)&spuResults[2],
        GeomType:           FloraGeomType_3Card,
        SubTextureU:        0.f,
        SubTextureV:        0.f,
        ScaleBase:          0.33f,
        ScaleRand:          0.13f,
        Wind:               0.125f,
        RandSeed:           0xdecafbad,
      },
    };

    // NOTE(review): the modelview matrix is read here, before this frame's
    // gluLookAtf call further down, so the SPU is given the camera set up by
    // the previous iteration -- confirm the one-frame lag is acceptable.
    // The third component of each row is negated before it is passed on.
    vec_float4 matrix[4];
    glGetFloatv(GL_MODELVIEW_MATRIX , (GLfloat*)matrix);
    matrix[0] *= (vec_float4){1,1,-1,1};
    matrix[1] *= (vec_float4){1,1,-1,1};
    matrix[2] *= (vec_float4){1,1,-1,1};
    matrix[3] *= (vec_float4){1,1,-1,1};
    float hSlope = (float)gGLWidth/(float)gGLHeight;
    float vSlope = 1.0f;

    FloraSPUEnvironment spuEnviron =
    {
      OutputBufferAddr:       (uint32_t)pFloraGeometry[frameCounter&1],
      OutputBufferSize:       FLORA_OUTPUT_VERTS,
      Time:                   0.125f * (float)((double)sys_time_get_system_time() / 1000000.0),
      Camera0:                matrix[0],
      Camera1:                matrix[1],
      Camera2:                matrix[2],
      Camera3:                matrix[3],
      NearClipOffset:         NEAR_CLIP,
      FarClipOffset:          FAR_CLIP,
      TanHorizFOV:            hSlope,
      TanVertFOV:             vSlope,
      InvSinHorizFOV:         sqrt(hSlope*hSlope + 1.f) / hSlope,
      InvSinVertFOV:          sqrt(vSlope*vSlope + 1.f) / vSlope,
      StartShrinkDepth:       NEAR_CLIP,
      EndShrinkDepth:         FAR_CLIP,
    };
    const int numSPUCommands = sizeof(spuCommands) / sizeof(*spuCommands);

    //printf("cameraDir %.3f %.3f %.3f Matrix2 %.3f %.3f %.3f\n", (float)cameraDir.getX(), (float)cameraDir.getY(), (float)cameraDir.getZ(), ((float*)&matrix[0])[2], ((float*)&matrix[1])[2], ((float*)&matrix[2])[2]);

    snStartMarker(0, "SPU");

    // Kick off the flora
    // The send is checked: if it failed silently, the wait for the SPU's
    // completion event below would block forever.
    SAFE_CALL( sys_event_port_send, gSpuBoundEventPort, (uint32_t)&spuEnviron, (uint32_t)&spuCommands, numSPUCommands );

    // Camera movement
    // NOTE(review): the result of cellPadGetData is not checked and padData is
    // not initialized beforehand -- confirm button[] is valid on every call.
    CellPadData padData;
    cellPadGetData(0, &padData);
    ControlCamera(padData, cameraPos, cameraDir);
    Point3 cameraTarget = cameraPos + cameraDir;
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    gluLookAtf(cameraPos.getX(),cameraPos.getY(),cameraPos.getZ(), cameraTarget.getX(),cameraTarget.getY(),cameraTarget.getZ(), 0.f,1.f,0.f);

    // Set up shader parameters
    cgGLBindProgram(gVertexShader);
    cgGLBindProgram(gFragmentShader);

    cgGLSetStateMatrixParameter(gVParmWorldViewProjMat, CG_GL_MODELVIEW_PROJECTION_MATRIX, CG_GL_MATRIX_IDENTITY);
    cgGLSetParameter3f(gVParmLightAmbient, 1.0f, 1.0f, 1.0f);
    cgGLSetParameter2f(gVParmSubTexScale, 1.0f, 1.0f);
    cgGLSetParameter1f(gVParmFogFar, FAR_CLIP);
    cgGLSetParameter1f(gVParmFogRange, 1.0f/(FAR_CLIP-NEAR_CLIP));

    cgGLSetTextureParameter(gFParmColorTex, texture);
    cgGLSetParameter3f(gFParmFogColor, 0.0f, 0.0f, 0.0f);
    cgGLSetParameter1f(gFParmFogOn, 1.0f);

    // Render setup
    glClearColor(0.f, 0.f, 0.f, 0.f);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);

    glColor4f(1.f, 1.f, 1.f, 1.f);

    // Draw from the same VBO the SPU was told to fill this frame
    glBindBuffer(GL_ARRAY_BUFFER, VBOs[frameCounter&1]);
    //glEnableClientState(GL_TEXTURE_COORD_ARRAY);
    glEnableClientState(GL_VERTEX_ARRAY);
    //glTexCoordPointer(4, GL_FLOAT, sizeof(FloraSPUVertex), 0);
    glVertexPointer(4, GL_FLOAT, sizeof(FloraSPUVertex), 0);

    // Ready to render now, wait for the SPU to finish and then draw
    WaitForFloraSPUFinish();

    snStopMarker(0);

    snStartMarker(1, "DRAW");

    int i;

    // Do a depth only pass with alpha test >= 75% opacity
    glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
    glDepthMask(GL_TRUE);
    glAlphaFunc(GL_GEQUAL, 192.0f/255.0f);

    for (i = 0; i < numSPUCommands; i++)
    {
      if (spuResults[i].NumVerts > 0)
      {
        glDrawArrays(GL_QUADS, spuResults[i].FirstVertIdx, spuResults[i].NumVerts);
      }
    }

    // Now the color pass with no z writes
    glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
    glDepthMask(GL_FALSE);
    glAlphaFunc(GL_GEQUAL, 1.0f/255.0f);

    for (i = 0; i < numSPUCommands; i++)
    {
      if (spuResults[i].NumVerts > 0)
      {
        glDrawArrays(GL_QUADS, spuResults[i].FirstVertIdx, spuResults[i].NumVerts);
      }
    }

    snStopMarker(1);

    psglSwap();
  }
}
