/*
 * INTEL OVERLOAD
 *
 * Intel overloaded functions.
 *
 * Copyright (c) 1998 Criterion Software Ltd.
 *
 */

/****************************************************************************
 Includes
 */

#include <stdio.h>
#include <stdlib.h>

#include "rpplugin.h"
#include "rpdbgerr.h"

#if (defined(__ICL) && defined(RWSIMD))
#if (400 <= __ICL)

#include "rtintel.h"
#include "bodySSETransform.h"


/*
 * Revision-control identification string for this translation unit.
 * NOTE(review): __RWUNUSED__ is defined outside this file; presumably it
 * marks the otherwise unreferenced variable as intentionally unused --
 * confirm.
 */
static const char  __RWUNUSED__  rcsid[] =
    "@@(#)$Id: bodySSETransform.c,v 1.11 2001/06/12 08:55:02 johns Exp $";

/****************************************************************************
 local types
 */

/*
 * Scalar camera parameters cached for the SSE transform code.
 * The values are broadcast into SSE registers by _rwSSECAMERALOAD and
 * _rwSSECLIPLOAD.
 */
typedef struct _rwSSECamInfoStruct _rwSSECamInfo;
struct _rwSSECamInfoStruct
{
    RwReal              zScale;      /* copy of camera->zScale */
    RwReal              zShift;      /* copy of camera->zShift */
    RwReal              nearClip;    /* copy of camera->nearPlane */
    RwReal              farClip;     /* copy of camera->farPlane */
    RwReal              camWidth;    /* camera raster width, as a real */
    RwReal              camHeight;   /* camera raster height, as a real */
    RwReal              camOffsetX;  /* copy of camera->viewOffset.x */
    RwReal              camOffsetY;  /* copy of camera->viewOffset.y */
    RwReal              zBufferNear; /* not referenced in the code reviewed here */
    RwReal              zBufferFar;  /* not referenced in the code reviewed here */
};

/****************************************************************************
 local defines
 */

/*
 * Assumes that the texture coordinate U,V pair of each vertex type is
 * 1) contiguous
 * 2) Rt_m64 aligned
 * This is currently the case -- see the header files listed in the
 * comment preceding SSETransformGetVertexUV.
 */

/* Address of the U member of a camera-space vertex (V is assumed to follow) */
#define CAMUVPAIRPTR(_vtx)                                                \
    (&((RxCamSpace3DVertex *) (_vtx))->u)

/*
 * Address of the U coordinate of an object-space vertex.
 * Relies on V being stored immediately after U.
 */
#define OBJUVPAIRPTR(_vtx)                                                \
    (&(RxObjSpace3DVertexGetU((RxObjSpace3DVertex *) (_vtx))))

/*
 * Address of the U coordinate of a screen-space (device) vertex.
 * Relies on V being stored immediately after U.
 */
#define DEVUVPAIRPTR(_vtx)                                                \
    (&(RwIm2DVertexGetU((RxScrSpace2DVertex *) (_vtx))))

/****************************************************************************
 local (static) globals
 */

/*
 * File-scope copy of the current camera's parameters.
 * _rpSSETransformNodeParallel() fills it from the camera and its raster
 * before processing packets and reads it back through the
 * _rwSSECAMERALOAD / _rwSSECLIPLOAD macros.
 */
static _rwSSECamInfo gCamInfo;

/*
 * Broadcasts each component of the right/up/at/pos rows of the matrix
 * pointed to by _mtx with _mm_set_ps1.  Elements [0], [1] and [2] of every
 * output array receive the x, y and z component of the matching row.
 * _mtx is expanded once per component, so it must have no side effects.
 */
#define _rwSSEMATRIXLOAD(_rgt, _up, _at, _pos, _mtx)                      \
MACRO_START                                                               \
{                                                                         \
    /* right row */                                                       \
    (_rgt)[0].m128 = _mm_set_ps1((_mtx)->right.x);                        \
    (_rgt)[1].m128 = _mm_set_ps1((_mtx)->right.y);                        \
    (_rgt)[2].m128 = _mm_set_ps1((_mtx)->right.z);                        \
                                                                          \
    /* up row */                                                          \
    (_up)[0].m128 = _mm_set_ps1((_mtx)->up.x);                            \
    (_up)[1].m128 = _mm_set_ps1((_mtx)->up.y);                            \
    (_up)[2].m128 = _mm_set_ps1((_mtx)->up.z);                            \
                                                                          \
    /* at row */                                                          \
    (_at)[0].m128 = _mm_set_ps1((_mtx)->at.x);                            \
    (_at)[1].m128 = _mm_set_ps1((_mtx)->at.y);                            \
    (_at)[2].m128 = _mm_set_ps1((_mtx)->at.z);                            \
                                                                          \
    /* pos row */                                                         \
    (_pos)[0].m128 = _mm_set_ps1((_mtx)->pos.x);                          \
    (_pos)[1].m128 = _mm_set_ps1((_mtx)->pos.y);                          \
    (_pos)[2].m128 = _mm_set_ps1((_mtx)->pos.z);                          \
}                                                                         \
MACRO_STOP

/*
 * Broadcasts the scalar camera parameters held in _info (a
 * _rwSSECamInfo-style struct, accessed with '.') into the given SSE
 * overlays: raster width/height, view offsets and Z scale/shift.
 */
#define _rwSSECAMERALOAD(_wide, _high,                                    \
                         _offX, _offY,                                    \
                         _zMul, _zAdd,                                    \
                         _info)                                           \
MACRO_START                                                               \
{                                                                         \
    /* raster dimensions */                                               \
    (_wide).m128 = _mm_set_ps1((_info).camWidth);                         \
    (_high).m128 = _mm_set_ps1((_info).camHeight);                        \
                                                                          \
    /* view offsets */                                                    \
    (_offX).m128 = _mm_set_ps1((_info).camOffsetX);                       \
    (_offY).m128 = _mm_set_ps1((_info).camOffsetY);                       \
                                                                          \
    /* depth scale and shift */                                           \
    (_zMul).m128 = _mm_set_ps1((_info).zScale);                           \
    (_zAdd).m128 = _mm_set_ps1((_info).zShift);                           \
}                                                                         \
MACRO_STOP

/*
 * Broadcasts the near/far clip distances from _info and the six clip-flag
 * constants (rwXLOCLIP .. rwZHICLIP) into SSE overlays.
 *
 * Each flag constant is written to the nUInt member of an RwSplitBits and
 * read back through nReal, so the register carries the flag's raw bit
 * pattern rather than its numeric value; callers combine it with compare
 * masks using bitwise SSE operations.
 * NOTE(review): RwSplitBits is declared elsewhere; presumably a union
 * overlaying an integer and a real -- confirm.
 */
#define _rwSSECLIPLOAD(_nearC, _farC,                                     \
                       _xLow, _xHigh,                                     \
                       _yLow, _yHigh,                                     \
                       _zLow, _zHigh,                                     \
                       _info)                                             \
MACRO_START                                                               \
{                                                                         \
    RwSplitBits _sseClipSplit;                                            \
                                                                          \
    /* clip plane distances */                                            \
    (_nearC).m128 = _mm_set_ps1((_info).nearClip);                        \
    (_farC).m128 = _mm_set_ps1((_info).farClip);                          \
                                                                          \
    /* X flags */                                                         \
    _sseClipSplit.nUInt = (RwUInt32) rwXLOCLIP;                           \
    (_xLow).m128 = _mm_set_ps1(_sseClipSplit.nReal);                      \
    _sseClipSplit.nUInt = (RwUInt32) rwXHICLIP;                           \
    (_xHigh).m128 = _mm_set_ps1(_sseClipSplit.nReal);                     \
                                                                          \
    /* Y flags */                                                         \
    _sseClipSplit.nUInt = (RwUInt32) rwYLOCLIP;                           \
    (_yLow).m128 = _mm_set_ps1(_sseClipSplit.nReal);                      \
    _sseClipSplit.nUInt = (RwUInt32) rwYHICLIP;                           \
    (_yHigh).m128 = _mm_set_ps1(_sseClipSplit.nReal);                     \
                                                                          \
    /* Z flags */                                                         \
    _sseClipSplit.nUInt = (RwUInt32) rwZLOCLIP;                           \
    (_zLow).m128 = _mm_set_ps1(_sseClipSplit.nReal);                      \
    _sseClipSplit.nUInt = (RwUInt32) rwZHICLIP;                           \
    (_zHigh).m128 = _mm_set_ps1(_sseClipSplit.nReal);                     \
}                                                                         \
MACRO_STOP

/****************************************************************************
 _rwSSETransformSetSSEOverlayM128

 On entry   : Instanced data
 On exit    : None
 */

/* Address of the x component of an object-space vertex position */
#define OBJVERTPTR(_vtx)                                                  \
    (&((RxObjSpace3DVertex *) (_vtx))->objVertex.x)

/*
 * Loads the positions of four object-space vertices into _dst[0..3] with
 * unaligned loads.  Each load starts one float BEFORE objVertex.x, so lane
 * 0 holds whatever precedes x in memory and lanes 1..3 hold x, y, z.
 * NOTE(review): if objVertex is the first member of the first vertex in
 * the buffer, this reads 4 bytes in front of the buffer -- confirm the
 * cluster layout makes that safe.
 */
#define SSETransformSetSSEOverlayM128(_vtx0, _vtx1,                       \
                                      _vtx2, _vtx3,                       \
                                      _dst)                               \
MACRO_START                                                               \
{                                                                         \
    (_dst)[0].m128 = _mm_loadu_ps(OBJVERTPTR(_vtx0) - 1);                 \
    (_dst)[1].m128 = _mm_loadu_ps(OBJVERTPTR(_vtx1) - 1);                 \
    (_dst)[2].m128 = _mm_loadu_ps(OBJVERTPTR(_vtx2) - 1);                 \
    (_dst)[3].m128 = _mm_loadu_ps(OBJVERTPTR(_vtx3) - 1);                 \
}                                                                         \
MACRO_STOP

/****************************************************************************
 _rwSSETransformGetVertexUV

 On entry   : Instanced data
 On exit    : None
 */

/*
 * Assumes that the texture coordinate U,V pair of each of
 *    RxObjSpace3DVertex
 *    RxCamSpace3DVertex
 *    RxScrSpace2DVertex/RwD3DTLVERTEX
 * is
 * 1) contiguous
 * 2) Rt_m64 aligned
 * This is currently the case -- see
 *    rwsdk/src/pipe/p2/d3d/pip2model.h
 *    rwsdk/src/pipe/p2/p2stdcls.h
 *    rwsdk/src/pipe/p2/d3d/pip2model.h + rwsdk/driver/d3d/drvmodel.h
 * respectively
 */

/*
 * Gathers the U,V pairs of four object-space vertices into two SSE
 * overlays:
 *     uv[0] = { u0, v0, u1, v1 }
 *     uv[1] = { u2, v2, u3, v3 }
 * Each pair is fetched as one 64-bit load from OBJUVPAIRPTR(), which
 * relies on V being stored directly after U.
 *
 * The merge source for the low-half load is an explicit zero register
 * rather than the previous contents of uv[n]: the caller's uv array has no
 * initialiser, so reusing it would read an indeterminate value on the
 * first call.  Both halves are overwritten, so the result is unchanged.
 */
#define SSETransformGetVertexUV(objVert0, objVert1,                       \
                                objVert2, objVert3,                       \
                                uv)                                       \
MACRO_START                                                               \
{                                                                         \
    (uv)[0].m128 =                                                        \
        _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(),                       \
                                  (Rt_m64 *) OBJUVPAIRPTR(objVert0)),     \
                     (Rt_m64 *) OBJUVPAIRPTR(objVert1));                  \
    (uv)[1].m128 =                                                        \
        _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(),                       \
                                  (Rt_m64 *) OBJUVPAIRPTR(objVert2)),     \
                     (Rt_m64 *) OBJUVPAIRPTR(objVert3));                  \
}                                                                         \
MACRO_STOP

/****************************************************************************
 _rwSSETransformSetVertexCol

 On entry   : Instanced data
 On exit    : None
 */

/* Scale factor applied to 8-bit colour channels: 1/255 */
#define R255 (1.0f / 255.f)

/*
 * Copies the colour of one object-space vertex to its camera-space and
 * screen-space counterparts.
 *   - camera vertex: the four channels are converted to float, multiplied
 *     by R255 and written with a single unaligned 4-float store starting
 *     at col.red (so col is assumed to hold red, green, blue, alpha as
 *     four consecutive floats);
 *   - device vertex: the integer channels are passed through unchanged.
 */
#define SSETransformSetVertexColour(objVerts, camVerts, devVerts)         \
MACRO_START                                                               \
{                                                                         \
    static const RpSSEOverlayM128 _sseColScale =                          \
        { { R255, R255, R255, R255 } };                                   \
    RwRGBA              _sseColInt;                                       \
    RpSSEOverlayM128    _sseColReal;                                      \
                                                                          \
    /* Fetch the packed integer colour */                                 \
    RxObjSpace3DVertexGetColor(((RxObjSpace3DVertex *) (objVerts)),       \
                                 &_sseColInt);                            \
                                                                          \
    /* Widen every channel to float: lanes are r, g, b, a */              \
    _sseColReal._f[0] = (float) _sseColInt.red;                           \
    _sseColReal._f[1] = (float) _sseColInt.green;                         \
    _sseColReal._f[2] = (float) _sseColInt.blue;                          \
    _sseColReal._f[3] = (float) _sseColInt.alpha;                         \
                                                                          \
    /* Scaled real colour goes to the camera-space vertex */              \
    _mm_storeu_ps(&(((RxCamSpace3DVertex *) (camVerts))->col.red),        \
                  _mm_mul_ps(_sseColReal.m128, _sseColScale.m128));       \
                                                                          \
    /* Integer colour goes to the device vertex */                        \
    RwIm2DVertexSetIntRGBA(((RxScrSpace2DVertex *) (devVerts)),           \
                           _sseColInt.red, _sseColInt.green,              \
                           _sseColInt.blue, _sseColInt.alpha);            \
}                                                                         \
MACRO_STOP

/****************************************************************************
 _rwPipeTransformPerspectiveSSE

 On entry   : Instanced data
 On exit    : None
 */

/*
 * Finishes one vertex of a perspective-transformed group of four.
 *
 *   _camVerts     camera-space vertex (already holds position/clipFlags)
 *   _devVerts     screen-space vertex to fill in
 *   _bVertRxUVs   non-zero when texture coordinates must be copied
 *   _i            lane (0..3) of this vertex inside _out / _nRecipZ
 *   _uv           overlay holding this vertex's U,V pair
 *   _MM_STORE_PI  name of the 64-bit store used to write the pair from
 *                 _uv.m128 (NOTE(review): presumably _mm_storel_pi or
 *                 _mm_storeh_pi, picking the half of _uv -- confirm at the
 *                 call sites)
 *   _out          array of three overlays: screen x, y, z
 *   _nRecipZ      pointer to the overlay of reciprocal camera Z values
 *
 * A vertex with any clip flag set only gets its UVs copied to the camera
 * vertex; an unclipped vertex additionally gets camera position,
 * reciprocal Z, screen position and UVs written to the device vertex.
 */
#define SSEPerspectiveXformSetVertexRecip(_camVerts,                      \
                                          _devVerts,                      \
                                          _bVertRxUVs,                    \
                                          _i,                             \
                                          _uv,                            \
                                          _MM_STORE_PI,                   \
                                          _out,                           \
                                          _nRecipZ)                       \
MACRO_START                                                               \
{                                                                         \
    RxCamSpace3DVertex *const _sseCamV =                                  \
        (RxCamSpace3DVertex *) (_camVerts);                               \
                                                                          \
    if (0 != _sseCamV->clipFlags)                                         \
    {                                                                     \
        /* Clipped: no projection yet, but if the geometry is             \
         * textured copy the UVs in now while the vertices are            \
         * in cache */                                                    \
        if ((_bVertRxUVs))                                                \
        {                                                                 \
            _MM_STORE_PI((Rt_m64 *) CAMUVPAIRPTR(_sseCamV),               \
                         (_uv).m128);                                     \
        }                                                                 \
    }                                                                     \
    else                                                                  \
    {                                                                     \
        RxScrSpace2DVertex *const _sseDevV =                              \
            (RxScrSpace2DVertex *) (_devVerts);                           \
                                                                          \
        /* Camera-space position and reciprocal Z */                      \
        RwIm2DVertexSetCameraX(_sseDevV, _sseCamV->cameraVertex.x);       \
        RwIm2DVertexSetCameraY(_sseDevV, _sseCamV->cameraVertex.y);       \
        RwIm2DVertexSetCameraZ(_sseDevV, _sseCamV->cameraVertex.z);       \
        RwIm2DVertexSetRecipCameraZ(_sseDevV, (_nRecipZ)->_f[(_i)]);      \
                                                                          \
        /* Projected screen position */                                   \
        RwIm2DVertexSetScreenX(_sseDevV, (_out)[0]._f[(_i)]);             \
        RwIm2DVertexSetScreenY(_sseDevV, (_out)[1]._f[(_i)]);             \
        RwIm2DVertexSetScreenZ(_sseDevV, (_out)[2]._f[(_i)]);             \
                                                                          \
        /* Texture coordinates for both output vertices */                \
        if ((_bVertRxUVs))                                                \
        {                                                                 \
            _MM_STORE_PI((Rt_m64 *) CAMUVPAIRPTR(_sseCamV),               \
                         (_uv).m128);                                     \
            _MM_STORE_PI((Rt_m64 *) DEVUVPAIRPTR(_sseDevV),               \
                         (_uv).m128);                                     \
        }                                                                 \
    }                                                                     \
}                                                                         \
MACRO_STOP

/****************************************************************************
 SSEParallelXformSetVertexRecip

 On entry   : Instanced data
 On exit    : None
 */

/*
 * Finishes one vertex of a parallel-projected group of four.
 *
 *   _camVerts     camera-space vertex (already holds position/clipFlags)
 *   _devVerts     screen-space vertex to fill in
 *   _bVertRxUVs   non-zero when texture coordinates must be copied
 *   _i            lane (0..3) of this vertex inside _out
 *   _uv           overlay holding this vertex's U,V pair
 *   _MM_STORE_PI  name of the 64-bit store used to write the pair from
 *                 _uv.m128 (NOTE(review): presumably _mm_storel_pi or
 *                 _mm_storeh_pi, picking the half of _uv -- confirm at the
 *                 call sites)
 *   _out          array of three overlays: screen x, y, z
 *   _nRecipZ      not used by this macro; the reciprocal camera Z is
 *                 always written as 1.0
 *
 * A vertex with any clip flag set only gets its UVs copied to the camera
 * vertex; an unclipped vertex additionally gets camera position,
 * reciprocal Z, screen position and UVs written to the device vertex.
 */
#define SSEParallelXformSetVertexRecip(_camVerts,                         \
                                       _devVerts,                         \
                                       _bVertRxUVs,                       \
                                       _i,                                \
                                       _uv,                               \
                                       _MM_STORE_PI,                      \
                                       _out,                              \
                                       _nRecipZ)                          \
MACRO_START                                                               \
{                                                                         \
    RxCamSpace3DVertex *const _sseCamV =                                  \
        (RxCamSpace3DVertex *) (_camVerts);                               \
                                                                          \
    if (0 != _sseCamV->clipFlags)                                         \
    {                                                                     \
        /* Clipped: no projection yet, but if the geometry is             \
         * textured copy the UVs in now while the vertices are            \
         * in cache */                                                    \
        if ((_bVertRxUVs))                                                \
        {                                                                 \
            _MM_STORE_PI((Rt_m64 *) CAMUVPAIRPTR(_sseCamV),               \
                         (_uv).m128);                                     \
        }                                                                 \
    }                                                                     \
    else                                                                  \
    {                                                                     \
        RxScrSpace2DVertex *const _sseDevV =                              \
            (RxScrSpace2DVertex *) (_devVerts);                           \
                                                                          \
        /* Camera-space position; reciprocal Z is a constant 1 */         \
        RwIm2DVertexSetCameraX(_sseDevV, _sseCamV->cameraVertex.x);       \
        RwIm2DVertexSetCameraY(_sseDevV, _sseCamV->cameraVertex.y);       \
        RwIm2DVertexSetCameraZ(_sseDevV, _sseCamV->cameraVertex.z);       \
        RwIm2DVertexSetRecipCameraZ(_sseDevV, (RwReal) 1.0);              \
                                                                          \
        /* Projected screen position */                                   \
        RwIm2DVertexSetScreenX(_sseDevV, (_out)[0]._f[(_i)]);             \
        RwIm2DVertexSetScreenY(_sseDevV, (_out)[1]._f[(_i)]);             \
        RwIm2DVertexSetScreenZ(_sseDevV, (_out)[2]._f[(_i)]);             \
                                                                          \
        /* Texture coordinates for both output vertices */                \
        if ((_bVertRxUVs))                                                \
        {                                                                 \
            _MM_STORE_PI((Rt_m64 *) CAMUVPAIRPTR(_sseCamV),               \
                         (_uv).m128);                                     \
            _MM_STORE_PI((Rt_m64 *) DEVUVPAIRPTR(_sseDevV),               \
                         (_uv).m128);                                     \
        }                                                                 \
    }                                                                     \
}                                                                         \
MACRO_STOP

/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

   functions

   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

/*****************************************************************************
 TransformNode

 Generic (non-device-specific) transform Node.

 on entry: -
 on exit : -
*/

RwBool
_rpSSETransformNodeParallel(RxPipelineNodeInstance * self,
                            RwCamera * camera)
{

    RwBool              result = TRUE;
    RxPacket           *packet;
    RwRaster           *rpRas;
    RwInt32             camHeight;
    RwInt32             camWidth;

    RWFUNCTION(RWSTRING("_rpSSETransformNodeParallel"));

    rpRas = RwCameraGetRaster(camera);

    /*
     * Set up oft-used clipping numeros
     */
    camHeight = RwRasterGetHeight(rpRas);
    gCamInfo.camHeight = (RwReal) camHeight;
    gCamInfo.camOffsetX = camera->viewOffset.x;
    gCamInfo.camOffsetY = camera->viewOffset.y;
    camWidth = RwRasterGetWidth(rpRas);
    gCamInfo.camWidth = (RwReal) camWidth;
    gCamInfo.farClip = camera->farPlane;
    gCamInfo.nearClip = camera->nearPlane;
    gCamInfo.zScale = camera->zScale;
    gCamInfo.zShift = camera->zShift;

    for (packet = RxPacketFetch(self);
         NULL != packet; packet = RxPacketFetch(self))
    {

        static const RpSSEOverlayM128 _mm_zero =
            { {0.0f, 0.0f, 0.0f, 0.0f} };
        static const RpSSEOverlayM128 _mm_one =
            { {1.0f, 1.0f, 1.0f, 1.0f} };
        RwInt32             numVerts, i;
        RwMatrix           *matrix;
        RpSSEOverlayM128    uv[2];
        RxCluster          *objVerts, *camVerts, *devVerts, *meshState;
        RwBool              bVertColours;
        RwBool              bVertRxUVs;
        RxMeshStateVector  *meshData;
        RwChar             *objVert[4], *nextObjVert[4];
        RwChar             *camVert[4], *nextCamVert[4];
        RwChar             *devVert[4], *nextDevVert[4];
        RwInt32             objStride, camStride, devStride;
        RwSplitBits         split;
        RpSSEOverlayM128    v1, v2, v3;
        RpSSEOverlayM128    in[3], out[3];
        RpSSEOverlayM128    matRight[3], matUp[3], matAt[3], matPos[3];
        RpSSEOverlayM128    transpose[4], row[4];
        RpSSEOverlayM128    camWidth, camHeight;
        RpSSEOverlayM128    camOffsetX, camOffsetY;
        RpSSEOverlayM128    zShift, zScale;
        RpSSEOverlayM128    nearClip, farClip;
        RpSSEOverlayM128    xLoClip, xHiClip;
        RpSSEOverlayM128    yLoClip, yHiClip;
        RpSSEOverlayM128    zLoClip, zHiClip;
        RpSSEOverlayM128    xClip, yClip, zClip;
        RpSSEOverlayM128    clipFlagsOr, clipFlagsAnd;

#if (0)
        RpSSEOverlayM128    nRecipZ;
        RwRGBA              colour;
#endif /* (0) */

        objVerts = RxClusterLockRead(packet, 0);
        camVerts = RxClusterLockWrite(packet, 1, self);
        devVerts = RxClusterLockWrite(packet, 2, self);
        meshState = RxClusterLockWrite(packet, 3, self);
        meshData = RxClusterGetCursorData(meshState, RxMeshStateVector);

        if (meshData->NumVertices == 0)
        {
            /* Kill off this empty packet here */
            RxPacketDestroy(packet, self);
            continue;          /* Not actually an error... is it? */
        }

        /* Create new space for the Camera and Device vertices
         * (throwing away any old data if there was any) */
        RxClusterInitializeData(camVerts, meshData->NumVertices,
                                sizeof(RxCamSpace3DVertex));
        RxClusterInitializeData(devVerts, meshData->NumVertices,
                                sizeof(RxScrSpace2DVertex));

        if (!RxClusterGetCursorData(camVerts, RxCamSpace3DVertex)
            || !RxClusterGetCursorData(devVerts, RxScrSpace2DVertex))
        {
            RxPacketDestroy(packet, self);
            result = FALSE;    /* B'arf! */
            break;
        }

        matrix = &(meshData->Obj2Cam);
        numVerts = meshData->NumVertices;

        bVertColours = (meshData->Flags &
                        /* bageomet.h:    rpGEOMETRYCOLORED       */
                        0x02);

        bVertRxUVs = (meshData->Flags &
                      /* bageomet.h:    rpGEOMETRYTEXTURED      */
                      0x04);

        /*
         * Setup the clip flags and view mat constances.
         */
        clipFlagsOr.m128 = _mm_zero.m128;
        clipFlagsAnd.m128 = _mm_cmpeq_ps(_mm_zero.m128, _mm_zero.m128);

        _rwSSEMATRIXLOAD(matRight, matUp, matAt, matPos, matrix);
        _rwSSECAMERALOAD(camWidth, camHeight,
                         camOffsetX, camOffsetY, zScale, zShift,
                         gCamInfo);
        _rwSSECLIPLOAD(nearClip, farClip, xLoClip, xHiClip, yLoClip,
                       yHiClip, zLoClip, zHiClip, gCamInfo);

        /*
         * Get first four verts.
         * We will duplicate if necessary so the main loop will have
         * exactly div by 4 verts.
         */
        objStride = objVerts->stride;
        camStride = camVerts->stride;
        devStride = devVerts->stride;

        objVert[0] = RxClusterGetCursorData(objVerts, RwChar);
        devVert[0] = RxClusterGetCursorData(devVerts, RwChar);
        camVert[0] = RxClusterGetCursorData(camVerts, RwChar);

        objVert[1] = objVert[0];
        devVert[1] = devVert[0];
        camVert[1] = camVert[0];

        objVert[2] = objVert[0];
        devVert[2] = devVert[0];
        camVert[2] = camVert[0];

        objVert[3] = objVert[0];
        devVert[3] = devVert[0];
        camVert[3] = camVert[0];

        /*
         * N.b. Deliberate fall through -- no breaks.
         * C.f. Duff's Device
         * http://www.lysator.liu.se/c/duffs-device.html
         */

        switch (numVerts & 3)
        {
            case 0:
                objVert[1] = objVert[0] + objStride;
                camVert[1] = camVert[0] + camStride;
                devVert[1] = devVert[0] + devStride;

            case 3:
                objVert[2] = objVert[1] + objStride;
                camVert[2] = camVert[1] + camStride;
                devVert[2] = devVert[1] + devStride;

            case 2:
                objVert[3] = objVert[2] + objStride;
                camVert[3] = camVert[2] + camStride;
                devVert[3] = devVert[2] + devStride;

            default:
                ;
        }

        /* Parallel Projection */
        numVerts = (numVerts + 3) >> 2;
        while (--numVerts >= 0)
        {

            /* 
             * For prefetch example see
             * Intel/iatraining/Samples/w_dp3dtrans/DPTRANSFORM.cpp
             * _mm_prefetch(vertex+i*4+16, _MM_HINT_NTA);
             */
            nextObjVert[0] = objVert[3] + objStride;
            _mm_prefetch(nextObjVert[0], _MM_HINT_NTA);
            nextObjVert[1] = nextObjVert[0] + objStride;
            _mm_prefetch(nextObjVert[1], _MM_HINT_NTA);
            nextObjVert[2] = nextObjVert[1] + objStride;
            _mm_prefetch(nextObjVert[2], _MM_HINT_NTA);
            nextObjVert[3] = nextObjVert[2] + objStride;
            _mm_prefetch(nextObjVert[3], _MM_HINT_NTA);

            nextCamVert[0] = camVert[3] + camStride;
            _mm_prefetch(nextCamVert[0], _MM_HINT_NTA);
            nextCamVert[1] = nextCamVert[0] + camStride;
            _mm_prefetch(nextCamVert[1], _MM_HINT_NTA);
            nextCamVert[2] = nextCamVert[1] + camStride;
            _mm_prefetch(nextCamVert[2], _MM_HINT_NTA);
            nextCamVert[3] = nextCamVert[2] + camStride;
            _mm_prefetch(nextCamVert[3], _MM_HINT_NTA);

            nextDevVert[0] = devVert[3] + devStride;
            _mm_prefetch(nextDevVert[0], _MM_HINT_NTA);
            nextDevVert[1] = nextDevVert[0] + devStride;
            _mm_prefetch(nextDevVert[1], _MM_HINT_NTA);
            nextDevVert[2] = nextDevVert[1] + devStride;
            _mm_prefetch(nextDevVert[2], _MM_HINT_NTA);
            nextDevVert[3] = nextDevVert[2] + devStride;
            _mm_prefetch(nextDevVert[3], _MM_HINT_NTA);

            /* Load the four X, Y + Z */

            SSETransformSetSSEOverlayM128(objVert[0], objVert[1],
                                          objVert[2], objVert[3],
                                          transpose);

            if (bVertColours)
            {
                SSETransformSetVertexColour(objVert[0],
                                            camVert[0], devVert[0]);
                SSETransformSetVertexColour(objVert[1],
                                            camVert[1], devVert[1]);
                SSETransformSetVertexColour(objVert[2],
                                            camVert[2], devVert[2]);
                SSETransformSetVertexColour(objVert[3],
                                            camVert[3], devVert[3]);
            }

            if (bVertRxUVs)
            {
                SSETransformGetVertexUV(objVert[0], objVert[1],
                                        objVert[2], objVert[3], uv);
            }

            /*
             * _mm_shuffle_ps(Rt_m128 a ,  Rt_m128 b , int i )
             *
             * Selects four specific SP FP values from a and b, 
             * based on the mask i.
             * The mask must be an immediate
             *
             * See also icl _MM_TRANSPOSE4_PS macro
             */

            row[0].m128 =
                _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                               0x44);
            row[2].m128 =
                _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                               0xEE);
            row[1].m128 =
                _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                               0x44);
            row[3].m128 =
                _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                               0xEE);

            (in[0].m128) =
                _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
            (in[1].m128) =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
            (in[2].m128) =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

            /* Calc the X. */
            v1.m128 = _mm_mul_ps(in[0].m128, matRight[0].m128);
            v2.m128 = _mm_mul_ps(in[1].m128, matUp[0].m128);
            v3.m128 = _mm_mul_ps(in[2].m128, matAt[0].m128);
            out[0].m128 =
                _mm_add_ps(_mm_add_ps
                           (v1.m128,
                            v2.m128), _mm_add_ps(v3.m128,
                                                 matPos[0].m128));

            /* Calc the Y. */
            v1.m128 = _mm_mul_ps(in[0].m128, matRight[1].m128);
            v2.m128 = _mm_mul_ps(in[1].m128, matUp[1].m128);
            v3.m128 = _mm_mul_ps(in[2].m128, matAt[1].m128);
            out[1].m128 =
                _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                           _mm_add_ps(v3.m128, matPos[1].m128));

            /* Calc the Z. */
            v1.m128 = _mm_mul_ps(in[0].m128, matRight[2].m128);
            v2.m128 = _mm_mul_ps(in[1].m128, matUp[2].m128);
            v3.m128 = _mm_mul_ps(in[2].m128, matAt[2].m128);
            out[2].m128 =
                _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                           _mm_add_ps(v3.m128, matPos[2].m128));

            /* Save the results. */

            row[0].m128 =
                _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0x44);
            row[2].m128 =
                _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0xEE);
            row[1].m128 =
                _mm_shuffle_ps((out[1].m128), (out[2].m128), 0x44);
            row[3].m128 =
                _mm_shuffle_ps((out[1].m128), (out[2].m128), 0xEE);

            transpose[0].m128 =
                _mm_shuffle_ps(row[0].m128, row[1].m128, 0x88);
            transpose[1].m128 =
                _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
            transpose[2].m128 =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
            transpose[3].m128 =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

            ((RxCamSpace3DVertex *) (camVert[0]))->cameraVertex =
                transpose[0].v4d.v3d;
            ((RxCamSpace3DVertex *) (camVert[1]))->cameraVertex =
                transpose[1].v4d.v3d;
            ((RxCamSpace3DVertex *) (camVert[2]))->cameraVertex =
                transpose[2].v4d.v3d;
            ((RxCamSpace3DVertex *) (camVert[3]))->cameraVertex =
                transpose[3].v4d.v3d;
            /* No fields overloaded yet */

            /*
             * Only do the projection and store a depth buffer value
             * for vertices inside the view volume
             * 3 D clipped vertices will have to wait until later...
             */

            /* Check the X clip. */
            v1.m128 = _mm_cmpgt_ps(out[0].m128, _mm_one.m128);
            v1.m128 = _mm_and_ps(v1.m128, xHiClip.m128);
            v2.m128 = _mm_cmplt_ps(out[0].m128, _mm_zero.m128);
            v3.m128 = _mm_and_ps(v2.m128, xLoClip.m128);

            xClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
            xClip.m128 = _mm_or_ps(v3.m128, xClip.m128);

            /* Check the Y clip. */
            v1.m128 = _mm_cmpgt_ps(out[1].m128, _mm_one.m128);
            v1.m128 = _mm_and_ps(v1.m128, yHiClip.m128);
            v2.m128 = _mm_cmplt_ps(out[1].m128, _mm_zero.m128);
            v3.m128 = _mm_and_ps(v2.m128, yLoClip.m128);

            yClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
            yClip.m128 = _mm_or_ps(v3.m128, yClip.m128);

            /* Check the Z clip. */
            v1.m128 = _mm_cmpgt_ps(out[2].m128, farClip.m128);
            v1.m128 = _mm_and_ps(v1.m128, zHiClip.m128);
            v2.m128 = _mm_cmplt_ps(out[2].m128, nearClip.m128);
            v3.m128 = _mm_and_ps(v2.m128, zLoClip.m128);

            zClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
            zClip.m128 = _mm_or_ps(v3.m128, zClip.m128);

            /* Combine the clip flags. */
            zClip.m128 = _mm_or_ps(zClip.m128, xClip.m128);
            zClip.m128 = _mm_or_ps(zClip.m128, yClip.m128);

            /* Store the clip flags. */
            split.nReal = zClip._f[0];
            ((RxCamSpace3DVertex *) (camVert[0]))->clipFlags =
                (RwUInt8) split.nUInt;

            split.nReal = zClip._f[1];
            ((RxCamSpace3DVertex *) (camVert[1]))->clipFlags =
                (RwUInt8) split.nUInt;

            split.nReal = zClip._f[2];
            ((RxCamSpace3DVertex *) (camVert[2]))->clipFlags =
                (RwUInt8) split.nUInt;

            split.nReal = zClip._f[3];
            ((RxCamSpace3DVertex *) (camVert[3]))->clipFlags =
                (RwUInt8) split.nUInt;

            clipFlagsOr.m128 = _mm_or_ps(clipFlagsOr.m128, zClip.m128);
            clipFlagsAnd.m128 =
                _mm_and_ps(clipFlagsAnd.m128, zClip.m128);

            out[0].m128 = _mm_mul_ps(out[0].m128, camWidth.m128);
            out[0].m128 = _mm_add_ps(out[0].m128, camOffsetX.m128);

            out[1].m128 = _mm_mul_ps(out[1].m128, camHeight.m128);
            out[1].m128 = _mm_add_ps(out[1].m128, camOffsetY.m128);

            out[2].m128 = _mm_mul_ps(out[2].m128, zScale.m128);
            out[2].m128 = _mm_add_ps(out[2].m128, zShift.m128);

            /* Set the recip */
            SSEParallelXformSetVertexRecip(camVert[0], devVert[0],
                                           bVertRxUVs, 0,
                                           uv[0], _mm_storel_pi,
                                           out, &nRecipZ);

            SSEParallelXformSetVertexRecip(camVert[1], devVert[1],
                                           bVertRxUVs, 1,
                                           uv[0], _mm_storeh_pi,
                                           out, &nRecipZ);

            SSEParallelXformSetVertexRecip(camVert[2], devVert[2],
                                           bVertRxUVs, 2,
                                           uv[1], _mm_storel_pi,
                                           out, &nRecipZ);

            SSEParallelXformSetVertexRecip(camVert[3], devVert[3],
                                           bVertRxUVs, 3,
                                           uv[1], _mm_storeh_pi,
                                           out, &nRecipZ);

            /* Onto the next 4 vertex */

            objVert[0] = nextObjVert[0];
            objVert[1] = nextObjVert[1];
            objVert[2] = nextObjVert[2];
            objVert[3] = nextObjVert[3];

            camVert[0] = nextCamVert[0];
            camVert[1] = nextCamVert[1];
            camVert[2] = nextCamVert[2];
            camVert[3] = nextCamVert[3];

            devVert[0] = nextDevVert[0];
            devVert[1] = nextDevVert[1];
            devVert[2] = nextDevVert[2];
            devVert[3] = nextDevVert[3];

        }

        devVerts->numUsed = meshData->NumVertices;
        camVerts->numUsed = meshData->NumVertices;

        /* If clipFlagsOr   = 0, everything is on  the screen,
         * If clipFlagsAnd != 0, everything is off the screen.
         */
        i = 0;
        split.nReal = clipFlagsOr._f[0];
        i |= split.nUInt;
        split.nReal = clipFlagsOr._f[1];
        i |= split.nUInt;
        split.nReal = clipFlagsOr._f[2];
        i |= split.nUInt;
        split.nReal = clipFlagsOr._f[3];
        i |= split.nUInt;
        meshData->ClipFlagsOr |= i;

        i = -1;
        split.nReal = clipFlagsAnd._f[0];
        i &= split.nUInt;
        split.nReal = clipFlagsAnd._f[1];
        i &= split.nUInt;
        split.nReal = clipFlagsAnd._f[2];
        i &= split.nUInt;
        split.nReal = clipFlagsAnd._f[3];
        i &= split.nUInt;
        meshData->ClipFlagsAnd &= i;

#if (0)
        /*
         *If we're not overloading stuff, copy it across now from the instanced
         * vertices to the camera and potentially the device vertices
         */
        if (!StateData->ClipFlagsAnd)
        {
            _rwSetNonOverloadedFieldsInCamAndDevVertexFn func;

            func =
                _rwPipeState.
                currentContext->fpSetNonOverloadedFieldsInCamAndDevVert;
            func(repEntry);
        }
#endif /* (0) */

        /* Output the packet to the first output of this Node... 
         * ...unless the vertices are entirely offscreen, in which case
         * send the packet to the second output (usually to be destroyed) */
        if (i == 0)
        {
            RxPacketDispatch(packet, 0, self);
        }
        else
        {
            RxPacketDispatch(packet, 1, self);
        }

        /* RWCRTCHECKMEMORY(); */

    }

    RWRETURN(result);

}

/*
 * _rpSSETransformNodePerspective
 *
 * Perspective transform node body. For every packet fetched from "self":
 *
 *  - object-space vertices are read from cluster 0 and pushed through the
 *    mesh state's Obj2Cam matrix, four vertices per iteration, using SSE;
 *  - the camera-space position and per-vertex clip flags are written to
 *    the camera vertices (cluster 1);
 *  - X and Y are multiplied by a reciprocal of Z, scaled/offset with the
 *    values cached in gCamInfo, and handed together with the reciprocal to
 *    SSEPerspectiveXformSetVertexRecip for the camera/device vertices
 *    (clusters 1 and 2);
 *  - the OR and AND of all vertex clip flags are folded into the mesh
 *    state (cluster 3).
 *
 * Packets with NumVertices == 0 are destroyed and skipped.  A packet whose
 * AND of clip flags is zero is dispatched to output 0, any other packet to
 * output 1.
 *
 * Returns TRUE, or FALSE when no camera or device vertex data is available
 * after RxClusterInitializeData; in that case the packet is destroyed and
 * no further packets are fetched.
 */
RwBool
_rpSSETransformNodePerspective(RxPipelineNodeInstance * self,
                               RwCamera * camera)
{
    RwBool              result = TRUE;
    RxPacket           *packet;
    RwRaster           *rpRas;
    /* Named after the raster so that they are not shadowed by the SSE
     * camWidth / camHeight overlays declared inside the packet loop. */
    RwInt32             rasterHeight;
    RwInt32             rasterWidth;

    RWFUNCTION(RWSTRING("_rpSSETransformNodePerspective"));

    rpRas = RwCameraGetRaster(camera);

    /*
     * Cache the per-camera values used by every packet in gCamInfo;
     * _rwSSECAMERALOAD / _rwSSECLIPLOAD read them back below.
     */
    rasterHeight = RwRasterGetHeight(rpRas);
    gCamInfo.camHeight = (RwReal) rasterHeight;
    gCamInfo.camOffsetX = camera->viewOffset.x;
    gCamInfo.camOffsetY = camera->viewOffset.y;
    rasterWidth = RwRasterGetWidth(rpRas);
    gCamInfo.camWidth = (RwReal) rasterWidth;
    gCamInfo.farClip = camera->farPlane;
    gCamInfo.nearClip = camera->nearPlane;
    gCamInfo.zScale = camera->zScale;
    gCamInfo.zShift = camera->zShift;

    for (packet = RxPacketFetch(self);
         NULL != packet; packet = RxPacketFetch(self))
    {
        static const RpSSEOverlayM128 _mm_zero =
            { {0.0f, 0.0f, 0.0f, 0.0f} };
        static const RpSSEOverlayM128 _mm_one =
            { {1.0f, 1.0f, 1.0f, 1.0f} };
        RwInt32             numVerts, i;
        RwMatrix           *matrix;
        RpSSEOverlayM128    uv[2];
        RxCluster          *objVerts, *camVerts, *devVerts, *meshState;
        RwBool              bVertColours;
        RwBool              bVertRxUVs;
        RxMeshStateVector  *meshData;
        RwChar             *objVert[4], *nextObjVert[4];
        RwChar             *camVert[4], *nextCamVert[4];
        RwChar             *devVert[4], *nextDevVert[4];
        RwInt32             objStride, camStride, devStride;
        RwSplitBits         split;
        RpSSEOverlayM128    v1, v2, v3;
        RpSSEOverlayM128    in[3], out[3], nRecipZ;
        RpSSEOverlayM128    matRight[3], matUp[3], matAt[3], matPos[3];
        RpSSEOverlayM128    transpose[4], row[4];
        RpSSEOverlayM128    camWidth, camHeight;
        RpSSEOverlayM128    camOffsetX, camOffsetY;
        RpSSEOverlayM128    zShift, zScale;
        RpSSEOverlayM128    nearClip, farClip;
        RpSSEOverlayM128    xLoClip, xHiClip;
        RpSSEOverlayM128    yLoClip, yHiClip;
        RpSSEOverlayM128    zLoClip, zHiClip;
        RpSSEOverlayM128    xClip, yClip, zClip;
        RpSSEOverlayM128    clipFlagsOr, clipFlagsAnd;

        /*
         * Cluster slots: 0 = object vertices (read only), 1 = camera
         * vertices, 2 = device vertices, 3 = mesh state.
         * NOTE(review): the lock results and meshData are dereferenced
         * without NULL checks -- confirm the pipeline guarantees that
         * these clusters are present.
         */
        objVerts = RxClusterLockRead(packet, 0);
        camVerts = RxClusterLockWrite(packet, 1, self);
        devVerts = RxClusterLockWrite(packet, 2, self);
        meshState = RxClusterLockWrite(packet, 3, self);

        meshData = RxClusterGetCursorData(meshState, RxMeshStateVector);
        if (meshData->NumVertices == 0)
        {
            /* Kill off this empty packet here */
            RxPacketDestroy(packet, self);
            continue;          /* Not actually an error... is it? */

        }

        /* Create new space for the Camera and Device vertices
         * (throwing away any old data if there was any) */
        RxClusterInitializeData(camVerts, meshData->NumVertices,
                                sizeof(RxCamSpace3DVertex));
        RxClusterInitializeData(devVerts, meshData->NumVertices,
                                sizeof(RxScrSpace2DVertex));

        if (!RxClusterGetCursorData(camVerts, RxCamSpace3DVertex) ||
            !RxClusterGetCursorData(devVerts, RxScrSpace2DVertex))
        {
            /* No output storage: drop the packet and report failure. */
            RxPacketDestroy(packet, self);
            result = FALSE;
            break;
        }

        matrix = &(meshData->Obj2Cam);
        numVerts = meshData->NumVertices;

        bVertColours = (meshData->Flags &
                        /* bageomet.h:    rpGEOMETRYCOLORED       */
                        0x02);

        bVertRxUVs = (meshData->Flags &
                      /* bageomet.h:    rpGEOMETRYTEXTURED      */
                      0x04);

        /*
         * Set up the clip flag accumulators and the matrix / camera /
         * clip constants.  The OR accumulator starts with all bits clear
         * and the AND accumulator with all bits set (x == x yields an
         * all-ones mask in every lane).
         */
        clipFlagsOr.m128 = _mm_zero.m128;
        clipFlagsAnd.m128 = _mm_cmpeq_ps(_mm_zero.m128, _mm_zero.m128);

        _rwSSEMATRIXLOAD(matRight, matUp, matAt, matPos, matrix);
        _rwSSECAMERALOAD(camWidth, camHeight,
                         camOffsetX, camOffsetY, zScale, zShift,
                         gCamInfo);
        _rwSSECLIPLOAD(nearClip, farClip, xLoClip, xHiClip, yLoClip,
                       yHiClip, zLoClip, zHiClip, gCamInfo);

        /*
         * Get first four verts.
         * We will duplicate if necessary so the main loop will have
         * exactly mult of 4 verts.
         */
        objStride = objVerts->stride;
        camStride = camVerts->stride;
        devStride = devVerts->stride;

        objVert[0] = RxClusterGetCursorData(objVerts, RwChar);
        devVert[0] = RxClusterGetCursorData(devVerts, RwChar);
        camVert[0] = RxClusterGetCursorData(camVerts, RwChar);

        objVert[1] = objVert[0];
        devVert[1] = devVert[0];
        camVert[1] = camVert[0];

        objVert[2] = objVert[0];
        devVert[2] = devVert[0];
        camVert[2] = camVert[0];

        objVert[3] = objVert[0];
        devVert[3] = devVert[0];
        camVert[3] = camVert[0];

        /*
         * N.b. Deliberate fall through -- no breaks.
         * C.f. Duff's Device
         * http://www.lysator.liu.se/c/duffs-device.html
         *
         * Vertices addressed by the first group of four, by remainder:
         *   0 -> 0,1,2,3    3 -> 0,0,1,2    2 -> 0,0,0,1    1 -> 0,0,0,0
         * so that the last slot of the first group is always the final
         * vertex of a short leading group and every following group is
         * a full four.
         */

        switch (numVerts & 3)
        {
            case 0:
                objVert[1] = objVert[0] + objStride;
                camVert[1] = camVert[0] + camStride;
                devVert[1] = devVert[0] + devStride;
                /* fallthrough */

            case 3:
                objVert[2] = objVert[1] + objStride;
                camVert[2] = camVert[1] + camStride;
                devVert[2] = devVert[1] + devStride;
                /* fallthrough */

            case 2:
                objVert[3] = objVert[2] + objStride;
                camVert[3] = camVert[2] + camStride;
                devVert[3] = devVert[2] + devStride;
                /* fallthrough */

            default:
                ;
        }

        /* Perspective Projection: number of groups of four, rounded up */
        numVerts = (numVerts + 3) >> 2;

        while (--numVerts >= 0)
        {
            /* 
             * Compute the addresses of the next group of four vertices
             * and prefetch them while this group is being processed.
             *
             * For prefetch example see
             * Intel/iatraining/Samples/w_dp3dtrans/DPTRANSFORM.cpp
             * _mm_prefetch(vertex+i*4+16, _MM_HINT_NTA);
             */
            nextObjVert[0] = objVert[3] + objStride;
            _mm_prefetch(nextObjVert[0], _MM_HINT_NTA);
            nextObjVert[1] = nextObjVert[0] + objStride;
            _mm_prefetch(nextObjVert[1], _MM_HINT_NTA);
            nextObjVert[2] = nextObjVert[1] + objStride;
            _mm_prefetch(nextObjVert[2], _MM_HINT_NTA);
            nextObjVert[3] = nextObjVert[2] + objStride;
            _mm_prefetch(nextObjVert[3], _MM_HINT_NTA);

            nextCamVert[0] = camVert[3] + camStride;
            _mm_prefetch(nextCamVert[0], _MM_HINT_NTA);
            nextCamVert[1] = nextCamVert[0] + camStride;
            _mm_prefetch(nextCamVert[1], _MM_HINT_NTA);
            nextCamVert[2] = nextCamVert[1] + camStride;
            _mm_prefetch(nextCamVert[2], _MM_HINT_NTA);
            nextCamVert[3] = nextCamVert[2] + camStride;
            _mm_prefetch(nextCamVert[3], _MM_HINT_NTA);

            nextDevVert[0] = devVert[3] + devStride;
            _mm_prefetch(nextDevVert[0], _MM_HINT_NTA);
            nextDevVert[1] = nextDevVert[0] + devStride;
            _mm_prefetch(nextDevVert[1], _MM_HINT_NTA);
            nextDevVert[2] = nextDevVert[1] + devStride;
            _mm_prefetch(nextDevVert[2], _MM_HINT_NTA);
            nextDevVert[3] = nextDevVert[2] + devStride;
            _mm_prefetch(nextDevVert[3], _MM_HINT_NTA);

            /* Load the four X, Y + Z */

            SSETransformSetSSEOverlayM128(objVert[0], objVert[1],
                                          objVert[2], objVert[3],
                                          transpose);

            if (bVertColours)
            {
                SSETransformSetVertexColour(objVert[0],
                                            camVert[0], devVert[0]);
                SSETransformSetVertexColour(objVert[1],
                                            camVert[1], devVert[1]);
                SSETransformSetVertexColour(objVert[2],
                                            camVert[2], devVert[2]);
                SSETransformSetVertexColour(objVert[3],
                                            camVert[3], devVert[3]);
            }

            if (bVertRxUVs)
            {
                SSETransformGetVertexUV(objVert[0], objVert[1],
                                        objVert[2], objVert[3], uv);
            }

            /*
             * _mm_shuffle_ps(Rt_m128 a ,  Rt_m128 b , int i )
             *
             * Selects four specific SP FP values from a and b, 
             * based on the mask i
             * The mask must be an immediate
             *
             * See also icl _MM_TRANSPOSE4_PS macro
             *
             * Transpose from one-vector-per-vertex to one-vector-per-
             * component: in[0], in[1] and in[2] end up holding lanes 1,
             * 2 and 3 respectively of transpose[0..3].  Lane 0 of the
             * loaded vectors is not used.
             */

            row[0].m128 =
                _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                               0x44);

            row[2].m128 =
                _mm_shuffle_ps((transpose[0].m128), (transpose[1].m128),
                               0xEE);

            row[1].m128 =
                _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                               0x44);

            row[3].m128 =
                _mm_shuffle_ps((transpose[2].m128), (transpose[3].m128),
                               0xEE);

            (in[0].m128) =
                _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
            (in[1].m128) =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
            (in[2].m128) =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

            /*
             * out[k] = in[0]*matRight[k] + in[1]*matUp[k]
             *        + in[2]*matAt[k]    + matPos[k]
             */

            /* Calc the new pos X the four verts. */
            v1.m128 = _mm_mul_ps(in[0].m128, matRight[0].m128);
            v2.m128 = _mm_mul_ps(in[1].m128, matUp[0].m128);
            v3.m128 = _mm_mul_ps(in[2].m128, matAt[0].m128);
            out[0].m128 =
                _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                           _mm_add_ps(v3.m128, matPos[0].m128));

            /* Calc the new pos Y the four verts. */
            v1.m128 = _mm_mul_ps(in[0].m128, matRight[1].m128);
            v2.m128 = _mm_mul_ps(in[1].m128, matUp[1].m128);
            v3.m128 = _mm_mul_ps(in[2].m128, matAt[1].m128);
            out[1].m128 =
                _mm_add_ps(_mm_add_ps(v1.m128, v2.m128),
                           _mm_add_ps(v3.m128, matPos[1].m128));

            /* Calc the new pos Z the four verts. */
            v1.m128 = _mm_mul_ps(in[0].m128, matRight[2].m128);
            v2.m128 = _mm_mul_ps(in[1].m128, matUp[2].m128);
            v3.m128 = _mm_mul_ps(in[2].m128, matAt[2].m128);
            out[2].m128 =
                _mm_add_ps(_mm_add_ps
                           (v1.m128, v2.m128),
                           _mm_add_ps(v3.m128, matPos[2].m128));

            /*
             * Save the results: transpose back so that transpose[n]
             * holds (0, X, Y, Z) of vertex n, then store its v3d part
             * as the camera-space position.
             */

            row[0].m128 =
                _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0x44);
            row[2].m128 =
                _mm_shuffle_ps((_mm_zero.m128), (out[0].m128), 0xEE);
            row[1].m128 =
                _mm_shuffle_ps((out[1].m128), (out[2].m128), 0x44);
            row[3].m128 =
                _mm_shuffle_ps((out[1].m128), (out[2].m128), 0xEE);

            transpose[0].m128 =
                _mm_shuffle_ps(row[0].m128, row[1].m128, 0x88);
            transpose[1].m128 =
                _mm_shuffle_ps(row[0].m128, row[1].m128, 0xDD);
            transpose[2].m128 =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0x88);
            transpose[3].m128 =
                _mm_shuffle_ps(row[2].m128, row[3].m128, 0xDD);

            ((RxCamSpace3DVertex *) (camVert[0]))->cameraVertex =
                transpose[0].v4d.v3d;
            ((RxCamSpace3DVertex *) (camVert[1]))->cameraVertex =
                transpose[1].v4d.v3d;
            ((RxCamSpace3DVertex *) (camVert[2]))->cameraVertex =
                transpose[2].v4d.v3d;
            ((RxCamSpace3DVertex *) (camVert[3]))->cameraVertex =
                transpose[3].v4d.v3d;

            /*
             * Only do the projection and store a depth buffer value
             * for vertices inside the view volume
             * 3D clipped vertices will have to wait until later ...
             *
             * Each axis yields the Lo flag where the low test passes,
             * otherwise the Hi flag where the high test passes,
             * otherwise zero.
             */

            /* Check the X clip: hi when X > Z, lo when X < 0. */
            v1.m128 = _mm_cmpgt_ps(out[0].m128, out[2].m128);
            v1.m128 = _mm_and_ps(v1.m128, xHiClip.m128);
            v2.m128 = _mm_cmplt_ps(out[0].m128, _mm_zero.m128);
            v3.m128 = _mm_and_ps(v2.m128, xLoClip.m128);

            xClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
            xClip.m128 = _mm_or_ps(v3.m128, xClip.m128);

            /* Check the Y clip: hi when Y > Z, lo when Y < 0. */
            v1.m128 = _mm_cmpgt_ps(out[1].m128, out[2].m128);
            v1.m128 = _mm_and_ps(v1.m128, yHiClip.m128);
            v2.m128 = _mm_cmplt_ps(out[1].m128, _mm_zero.m128);
            v3.m128 = _mm_and_ps(v2.m128, yLoClip.m128);
            yClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
            yClip.m128 = _mm_or_ps(v3.m128, yClip.m128);

            /* Check the Z clip: hi when Z > far, lo when Z < near. */
            v1.m128 = _mm_cmpgt_ps(out[2].m128, farClip.m128);
            v1.m128 = _mm_and_ps(v1.m128, zHiClip.m128);
            v2.m128 = _mm_cmplt_ps(out[2].m128, nearClip.m128);
            v3.m128 = _mm_and_ps(v2.m128, zLoClip.m128);

            zClip.m128 = _mm_andnot_ps(v2.m128, v1.m128);
            zClip.m128 = _mm_or_ps(v3.m128, zClip.m128);

            /* Combine the clip flags; zClip now holds all three axes. */
            zClip.m128 = _mm_or_ps(zClip.m128, xClip.m128);
            zClip.m128 = _mm_or_ps(zClip.m128, yClip.m128);

            /*
             * Store the clip flags.  Each lane is reinterpreted as an
             * integer through "split" and truncated to RwUInt8.
             */
            split.nReal = zClip._f[0];
            ((RxCamSpace3DVertex *) (camVert[0]))->clipFlags =
                (RwUInt8) split.nUInt;

            split.nReal = zClip._f[1];
            ((RxCamSpace3DVertex *) (camVert[1]))->clipFlags =
                (RwUInt8) split.nUInt;

            split.nReal = zClip._f[2];
            ((RxCamSpace3DVertex *) (camVert[2]))->clipFlags =
                (RwUInt8) split.nUInt;

            split.nReal = zClip._f[3];
            ((RxCamSpace3DVertex *) (camVert[3]))->clipFlags =
                (RwUInt8) split.nUInt;

            /* Accumulate the per-packet OR and AND of the clip flags. */
            clipFlagsOr.m128 = _mm_or_ps(clipFlagsOr.m128, zClip.m128);
            clipFlagsAnd.m128 =
                _mm_and_ps(clipFlagsAnd.m128, zClip.m128);

            /*
             * Set the recip.  Lanes whose combined clip flags compare
             * equal to zero use the camera-space Z; every other lane
             * uses 1.0 instead.  _mm_rcp_ps then takes the (approximate)
             * reciprocal.
             */
            nRecipZ.m128 = _mm_cmpeq_ps(zClip.m128, _mm_zero.m128);
            nRecipZ.m128 =
                _mm_or_ps(_mm_and_ps(nRecipZ.m128, out[2].m128),
                          _mm_andnot_ps(nRecipZ.m128, _mm_one.m128));
            nRecipZ.m128 = _mm_rcp_ps(nRecipZ.m128);

            /*
             * Project:
             *   out[0] = X * recipZ * camWidth  + camOffsetX
             *   out[1] = Y * recipZ * camHeight + camOffsetY
             *   out[2] =     recipZ * zScale    + zShift
             */
            out[0].m128 = _mm_mul_ps(out[0].m128, nRecipZ.m128);
            out[0].m128 = _mm_mul_ps(out[0].m128, camWidth.m128);
            out[0].m128 = _mm_add_ps(out[0].m128, camOffsetX.m128);
            out[1].m128 = _mm_mul_ps(out[1].m128, nRecipZ.m128);
            out[1].m128 = _mm_mul_ps(out[1].m128, camHeight.m128);
            out[1].m128 = _mm_add_ps(out[1].m128, camOffsetY.m128);
            out[2].m128 = _mm_mul_ps(zScale.m128, nRecipZ.m128);
            out[2].m128 = _mm_add_ps(out[2].m128, zShift.m128);

            /*
             * Hand the projected values and the reciprocal to the
             * per-vertex store macro.  uv[0] is passed with the low /
             * high store for vertices 0 / 1, and uv[1] likewise for
             * vertices 2 / 3.
             */
            SSEPerspectiveXformSetVertexRecip(camVert[0], devVert[0],
                                              bVertRxUVs, 0,
                                              uv[0], _mm_storel_pi,
                                              out, &nRecipZ);

            SSEPerspectiveXformSetVertexRecip(camVert[1], devVert[1],
                                              bVertRxUVs, 1,
                                              uv[0], _mm_storeh_pi,
                                              out, &nRecipZ);

            SSEPerspectiveXformSetVertexRecip(camVert[2], devVert[2],
                                              bVertRxUVs, 2,
                                              uv[1], _mm_storel_pi,
                                              out, &nRecipZ);

            SSEPerspectiveXformSetVertexRecip(camVert[3], devVert[3],
                                              bVertRxUVs, 3,
                                              uv[1], _mm_storeh_pi,
                                              out, &nRecipZ);

            /* Get the next 4 verts. */

            objVert[0] = nextObjVert[0];
            objVert[1] = nextObjVert[1];
            objVert[2] = nextObjVert[2];
            objVert[3] = nextObjVert[3];

            camVert[0] = nextCamVert[0];
            camVert[1] = nextCamVert[1];
            camVert[2] = nextCamVert[2];
            camVert[3] = nextCamVert[3];

            devVert[0] = nextDevVert[0];
            devVert[1] = nextDevVert[1];
            devVert[2] = nextDevVert[2];
            devVert[3] = nextDevVert[3];

        }

        devVerts->numUsed = meshData->NumVertices;
        camVerts->numUsed = meshData->NumVertices;

        /* If clipFlagsOr   = 0, everything is on  the screen,
         * If clipFlagsAnd != 0, everything is off the screen.
         *
         * Collapse the four lanes of each accumulator into one integer
         * and merge it into the mesh state.
         */
        i = 0;
        split.nReal = clipFlagsOr._f[0];
        i |= split.nUInt;
        split.nReal = clipFlagsOr._f[1];
        i |= split.nUInt;
        split.nReal = clipFlagsOr._f[2];
        i |= split.nUInt;
        split.nReal = clipFlagsOr._f[3];
        i |= split.nUInt;
        meshData->ClipFlagsOr |= i;

        i = -1;
        split.nReal = clipFlagsAnd._f[0];
        i &= split.nUInt;
        split.nReal = clipFlagsAnd._f[1];
        i &= split.nUInt;
        split.nReal = clipFlagsAnd._f[2];
        i &= split.nUInt;
        split.nReal = clipFlagsAnd._f[3];
        i &= split.nUInt;
        meshData->ClipFlagsAnd &= i;

        /* Output the packet to the first output of this Node... 
         * ...unless the vertices are entirely offscreen, in which case
         * send the packet to the second output (usually to be destroyed).
         * At this point i is the AND of this packet's clip flags. */
        if (i == 0)
        {
            RxPacketDispatch(packet, 0, self);
        }
        else
        {
            RxPacketDispatch(packet, 1, self);
        }

        /* RWCRTCHECKMEMORY(); */

    }

    RWRETURN(result);

}

#endif /* (400 <= __ICL) */
#endif /* (defined(__ICL) && defined(RWSIMD)) */
