Forum

SIMD/SSE Instructions

Discuss programming topics for any language, any source base. If it is programming related but doesn't fit in one of the below categories, it goes here.

Moderator: InsideQC Admins

SIMD/SSE Instructions

Postby jitspoe » Wed May 29, 2013 8:04 pm

Anybody here messed around with simd instructions/intrinsics? I just recently tried them for the first time and attempted to optimize some skeletal model code in Paintball2. I was able to get some pretty decent speed improvements (around 14% overall fps increase in a scenario with lots of player models), but I don't feel like I'm doing things as optimally as I could. I think I need to write a test app to just try a bunch of different functions and see how they perform. I'm having a difficult time finding reasonable information on each function. Either it's a 1 sentence, vague description, or a 20 page thesis that I don't feel like reading through.

Things I'd like to know:
How many cycles does it take to do a load and store vs a loadu and storeu? How many operations do I need to do before it justifies doing the load and store? How many cycles does it take to shuffle? Is there a better way of shuffling data around into a more vertical format?

I feel like I end up doing so much prep work to get the data into a simd-friendly format, that it negates some (or all) of the benefits of doing simd.

When I tried to use simd for lighting the model (lots of dot products), it ended up being slower.

I know most of this stuff can be done on the graphics card, but I kind of want to optimize for the (surprisingly common) case of people with fancy processors and crap video cards.
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby revelator » Wed May 29, 2013 10:14 pm

Doom3 uses it extensively :) though not the SSE3 paths since it seems its slower than the non SSE code (not only noted by id several sources points to it being a fact).

Could probably get some good stuff from its idlib if you had a look.
Productivity is a state of mind.
User avatar
revelator
 
Posts: 2548
Joined: Thu Jan 24, 2008 12:04 pm
Location: inside tha debugger

Re: SIMD/SSE Instructions

Postby jitspoe » Thu May 30, 2013 3:59 am

You know, I still haven't really taken a good look at the Doom3 code. I should do that one of these days.

I made a rather interesting discovery... apparently using double for local variables is faster than float. I'm trying to isolate and optimize a specific block of code. I'll just paste the whole test program (excuse the mess):

Code: Select all
#include <stdio.h>
#include <conio.h>
#include <xmmintrin.h>
#include <windows.h>


// Portable alignment annotations: MSVC takes a __declspec prefix before the
// declarator, while GCC/Clang take an __attribute__ suffix after it.
#if defined(_MSC_VER)
#define ALIGN(var) __declspec(align(16)) var   // 16-byte alignment, required for _mm_load_ps/_mm_store_ps
#define ATOMIC(var) __declspec(align(4)) var   // 4-byte alignment only; name is misleading — no atomicity is implied
#else
#define ALIGN(var) var __attribute__((__aligned__(16)))
#define ATOMIC(var) var __attribute__((__aligned__(4)))
#endif


float frand() // value between [0 and 1)
{
   // FIX: (RAND_MAX + 1) overflows int when RAND_MAX == INT_MAX, which is
   // the common case — signed overflow is undefined behavior. Promote to
   // float before adding 1 so the division is computed without overflow.
   return (float)rand() / ((float)RAND_MAX + 1.0f);
}

#define NORMAL_COUNT 400  // number of test normals; keep it a multiple of 4 for the SIMD paths

float inNormalsArray[NORMAL_COUNT][3];  // input normals (x,y,z per entry), filled with random values
float colorArray[NORMAL_COUNT][3];      // output colors written by the scalar and SIMD tests
double colorArrayD[NORMAL_COUNT][3];    // double-precision output variant (unused in the tests shown)
__m128 colorArrayM[NORMAL_COUNT];       // SSE-vector output variant (used by WRITE_COLOR_MM experiments)

float LightAngleF[3];  // light direction (deliberately not normalized for this speed test)
float LightValF[3];    // light color (RGB)
ALIGN(float LightOutAligned[NORMAL_COUNT][4]);  // 16-byte aligned scratch output buffer

// Broadcast SSE copies of the lighting parameters, initialized by PrepareData().
__m128 LightAngle;
__m128 lightdirX, lightdirY, lightdirZ;  // light direction splatted per component for the SoA path
__m128 one, ambient, halflight;          // constants: 1.0, ambient term (0.3), (1 - ambient) / 2

void PrepareData ()
{
   int i, j;
   float f1 = 1.0f;
   float fAmbient = 0.3f;
   float fHalfLight = (1.0f - fAmbient) * 0.5f;

   LightAngleF[0] = 0.1f;
   LightAngleF[1] = 0.2f;
   LightAngleF[2] = 0.3f; // meh, we don't really need to bother normalizing for a speed test.

   LightValF[0] = 0.6f;
   LightValF[1] = 0.7f;
   LightValF[2] = 0.8f;

   LightAngle = _mm_loadu_ps(LightAngleF);
   lightdirX = _mm_shuffle_ps(LightAngle, LightAngle, 0x00);
   lightdirY = _mm_shuffle_ps(LightAngle, LightAngle, 0x55);
   lightdirZ = _mm_shuffle_ps(LightAngle, LightAngle, 0xAA);
   one = _mm_load_ss(&f1);
   one = _mm_shuffle_ps(one, one, 0x00); // todo: init somewhere so we don't call this for each mesh.
   ambient = _mm_load_ss(&fAmbient);
   halflight = _mm_load_ss(&fHalfLight);
   ambient = _mm_shuffle_ps(ambient, ambient, 0x00);
   halflight = _mm_shuffle_ps(halflight, halflight, 0x00);


   for (i = 0; i < NORMAL_COUNT; ++i)
   {
      float length = 0;
      
      for (j = 0; j < 3; ++j)
      {
         inNormalsArray[i][j] = frand();
         // meh, we don't really need to bother normalizing for a speed test.
      }
   }
}



void TestDotsScalarFloat (int count)
{
   // Scalar reference path using float throughout: one dot product plus the
   // lighting formula per normal, then the light color scaled per channel.
   int idx;

   for (idx = 0; idx < count; ++idx)
   {
      const float *normal = inNormalsArray[idx];
      float light = ((LightAngleF[0] * normal[0] + LightAngleF[1] * normal[1] + LightAngleF[2] * normal[2]) + 1.0f) * 0.5f * 0.7f + 0.3f;

      colorArray[idx][0] = LightValF[0] * light;
      colorArray[idx][1] = LightValF[1] * light;
      colorArray[idx][2] = LightValF[2] * light;
   }
}


void TestDotsScalarDouble (int count)
{
   // Same lighting math as TestDotsScalarFloat, but the intermediate light
   // term is held in a double to compare codegen/performance of the two.
   int n;

   for (n = 0; n < count; ++n)
   {
      double light;

      light = ((LightAngleF[0] * inNormalsArray[n][0] + LightAngleF[1] * inNormalsArray[n][1] + LightAngleF[2] * inNormalsArray[n][2]) + 1.0) * 0.5 * 0.7 + 0.3;
      colorArray[n][0] = (float)(LightValF[0] * light);
      colorArray[n][1] = (float)(LightValF[1] * light);
      colorArray[n][2] = (float)(LightValF[2] * light);
   }
}

#if 1
void TestDotsSIMD (int count)
{
   __m128 shadelightM = _mm_loadu_ps(LightValF);
   int j;

   for (j = 0; j < count; j += 4)
   {
      // todo: align for better performance, and is there a faster way to store these in general?
      // Note: Aligning seems to have no performance impact unless the memory is actually unaligned (compiler automatically optimizes this?)
#define TEST_ALIGN
#ifdef TEST_ALIGN
      ALIGN(float normalsXf[4]) = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
      ALIGN(float normalsYf[4]) = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
      ALIGN(float normalsZf[4]) = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
      __m128 normalsXm = _mm_load_ps(normalsXf);
      __m128 normalsYm = _mm_load_ps(normalsYf);
      __m128 normalsZm = _mm_load_ps(normalsZf);
#else
      float normalsXf[5] = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
      float normalsYf[5] = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
      float normalsZf[5] = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
      __m128 normalsXm = _mm_loadu_ps(normalsXf);
      __m128 normalsYm = _mm_loadu_ps(normalsYf);
      __m128 normalsZm = _mm_loadu_ps(normalsZf);
#endif
      __m128 lightOut = _mm_add_ps(_mm_mul_ps(
         _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(normalsXm, lightdirX), _mm_mul_ps(normalsYm, lightdirY)), _mm_mul_ps(normalsZm, lightdirZ)), one),
         halflight), ambient);
      __m128 lightOut0 = _mm_shuffle_ps(lightOut, lightOut, 0x00);
      __m128 lightOut1 = _mm_shuffle_ps(lightOut, lightOut, 0x55);
      __m128 lightOut2 = _mm_shuffle_ps(lightOut, lightOut, 0xAA);
      __m128 lightOut3 = _mm_shuffle_ps(lightOut, lightOut, 0xFF);
      __m128 colorOut0 = _mm_mul_ps(lightOut0, shadelightM);
      __m128 colorOut1 = _mm_mul_ps(lightOut1, shadelightM);
      __m128 colorOut2 = _mm_mul_ps(lightOut2, shadelightM);
      __m128 colorOut3 = _mm_mul_ps(lightOut3, shadelightM);
      _mm_storeu_ps(colorArray[j + 0], colorOut0);
      _mm_storeu_ps(colorArray[j + 1], colorOut1);
      _mm_storeu_ps(colorArray[j + 2], colorOut2);
      _mm_storeu_ps(colorArray[j + 3], colorOut3);
   }
}
#endif

int main (int argc, char **argv)
{
   // Benchmark driver: times each lighting implementation once using the
   // Windows high-resolution counter and prints the elapsed milliseconds.
   // NOTE(review): each test runs only once per process, so the first test
   // pays the cache-warming cost — the thread's later posts discuss this.
   LARGE_INTEGER timeStart, timeEnd, queryFrequency;
   float perfToMS;
   float ms;
   PrepareData();
   QueryPerformanceFrequency(&queryFrequency);
   perfToMS = 1000.0f / queryFrequency.QuadPart;  // counter ticks -> milliseconds

   QueryPerformanceCounter(&timeStart);
   TestDotsScalarFloat(NORMAL_COUNT);
   QueryPerformanceCounter(&timeEnd);
   ms = (timeEnd.QuadPart - timeStart.QuadPart) * perfToMS;
   printf("scalar float : %gms\n", ms);

   QueryPerformanceCounter(&timeStart);
   TestDotsScalarDouble(NORMAL_COUNT);
   QueryPerformanceCounter(&timeEnd);
   ms = (timeEnd.QuadPart - timeStart.QuadPart) * perfToMS;
   printf("scalar double: %gms\n", ms);

   QueryPerformanceCounter(&timeStart);
   TestDotsSIMD(NORMAL_COUNT);
   QueryPerformanceCounter(&timeEnd);
   ms = (timeEnd.QuadPart - timeStart.QuadPart) * perfToMS;
   printf("SIMD         : %gms\n", ms);

   _getch();  // wait for a keypress so the console window stays open
   return 0;
}


Results:

Code: Select all
scalar float : 0.0111997ms
scalar double: 0.00703982ms
SIMD         : 0.0143996ms


The double version is 30-40% faster. Just a single floating point variable makes that much of a difference. I was trying to make the code more readable by separating the dot product into a separate variable and noticed it caused the time to be a lot slower. I switched it to a double, for kicks, and the time was about the same. Removed the second variable and made the remaining one a double and, bam, 30+% speed boost! All these years I assumed float was faster or at least on par with double. Now I feel kind of dumb. I did another test switching the arrays to doubles, but this appeared to slow it down again, so the trick seems to be: store large sets of data as floats, use doubles for local variables.

As for the simd... well, it's slower than the floating point version until I get over 9000 elements in the array.

Result with a count of 40000:
Code: Select all
scalar float : 0.363831ms
scalar double: 0.176956ms
SIMD         : 0.332792ms


I find it really strange that it takes that many loops to get a benefit from the simd version... or that the ratios really change much at all after 400 or so... odd. I'm sure my simd version can be optimized, but I'm wondering if it's even possible to compete with the double version.
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby jitspoe » Thu May 30, 2013 5:18 am

Well, I feel dumb again. Looks like the double version was faster because of caching. If I swap the order, the float version is faster. In reality, they're effectively the same... I think.

What's strange, though, is that I had both versions in Paintball2, with a cvar to alternate between the two, and it seemed there was a 3% or so overall FPS boost using the double version.

Edit: Scratch that, I screwed some stuff up when I was testing, the double version is, indeed, faster, but the first run through is a little slower regardless.

With a few more runs:

Code: Select all
scalar float : 0.358391ms
scalar double: 0.178556ms
scalar float : 0.255674ms
scalar double: 0.162876ms
SIMD         : 0.327992ms
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby jitspoe » Thu May 30, 2013 5:59 pm

Another update:

Using the /fp:fast optimization makes the double and float performance the same (so this is probably something you should make sure you have enabled on your engines if you don't, since we generally care more about speed than precision).

I've managed to make the SIMD version faster than the others by changing the data structure of the normals. Instead of having an array of vec3's, I changed it to 3 aligned arrays of floats. This avoids creating the local float[4]'s, which I think was causing the biggest performance hit.

That said, though, storing the normals in that format isn't ideal for other calculations, and you couldn't pass those arrays directly to OpenGL (I don't think). So... meh... SIMD optimization is tough.
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby r00k » Fri May 31, 2013 3:21 am

i found some sse code for things like square root and other math lib replacement
if i detect sse at init will this simple asm code be faster than the normal functions?
i mean for quake1 is it much benefit
it looks like a simple cut/paste codewise
r00k
 
Posts: 1110
Joined: Sat Nov 13, 2004 10:39 pm

Re: SIMD/SSE Instructions

Postby revelator » Fri May 31, 2013 10:29 am

depends for engines like tenebrae it makes quite a difference because it calculates a lot of heavy stuff on the cpu darkplaces seems to get along with the macro math functions just fine though but its also a lot more refined than tenebrae was in its days. Quake2xp ditto since it uses glslang for the gfx intensive calculations.

Guess the only way to find out is benchmarking after changes and then compare results.
Productivity is a state of mind.
User avatar
revelator
 
Posts: 2548
Joined: Thu Jan 24, 2008 12:04 pm
Location: inside tha debugger

Re: SIMD/SSE Instructions

Postby jitspoe » Fri May 31, 2013 10:02 pm

SSE is usually not a simple cut/paste. You have to set your data up so things can be run in parallel, and often the price you pay to load/store/shuffle data around is greater than the benefit you get. You may end up having to rearchitect data structures and all the code that uses them in order to see a benefit.

If you're talking about replacing a simple sqrt with the SSE version, then it's going to be slower. SSE instructions are set up to run 4 instructions at once (in the typical case of 32bit floats on __m128's). In fact, you couldn't just call a SSE function with a floating point value. You have to load it to the SIMD registers, then store it back to floating point values.
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby Spike » Fri May 31, 2013 11:08 pm

aye, sse is not good for dot products (unlike 3dnow), which are pretty common in 3d engines.
supposedly this stuff is meant to be improved with the sse4 instructions.
don't underestimate random transposes. :P
but yeah, avoid copying data from x87 to sse and back. if you use a little sse on your data, use a lot instead.
and yeah, glsl is prefered for calculating dotproducts for lighting, as well as skeletal transforms and stuff.
Spike
 
Posts: 2884
Joined: Fri Nov 05, 2004 3:12 am
Location: UK

Re: SIMD/SSE Instructions

Postby revelator » Sat Jun 01, 2013 12:27 am

Still trying to wrap my noggin around multi threading, though Theres lots of implementations gcc/omp -> pthreads msvc equivalent -> winthreads Intel has there own version with even more optimization pragmas than msvc and some other Company i cant remember the name off uses a special pthreads syntax also. Compatibility is so/so Intels pragmas cannot be used by msvc without there package installed besides the
very basic commands msvc understands itself, andd the same goes for that other Company. Atm i think gcc has the upper hand when it comes to omp use unless you buy Intels compiler.

Interesting thing and a good point to support the discussion here is an example where a guy modified Doom3 to use omp/pthreads for multithreading and achieved some very interesting results speedwise.
So yeah SSE definitely likes having a lot of data thrown at it :)
Productivity is a state of mind.
User avatar
revelator
 
Posts: 2548
Joined: Thu Jan 24, 2008 12:04 pm
Location: inside tha debugger

Re: SIMD/SSE Instructions

Postby revelator » Sat Jun 01, 2013 12:30 am

Sorry for the double Words in the above post, my keyboard needs a repair (mechanical razer blackwidow with the dreaded key chattering bug).
Productivity is a state of mind.
User avatar
revelator
 
Posts: 2548
Joined: Thu Jan 24, 2008 12:04 pm
Location: inside tha debugger

Re: SIMD/SSE Instructions

Postby jitspoe » Tue Jun 04, 2013 5:15 am

From what I've read, the new dot product "instructions" end up taking just as many (if not more) cycles as doing it with the old instruction sets. I'll have to dig into the 3DNow stuff. I didn't realize it was optimized for dot products. I find it kind of strange that SSE wouldn't have that from the beginning. It's kind of fundamental for vector math...

As far as multithreading goes (a bit off topic), a friend of mine was suggesting openmp. I haven't tried that. I've done a little with pthread (you can get a library so you can run it in windows). I've also used some #defines to allow me to use the same syntax for threading on different OS's.
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby revelator » Tue Jun 04, 2013 5:42 am

Only brought multithreading into the discussion because off SSE thread handles :) (kinda need multithreadding to run stuff in parallel).

I Work with Mingw64/Codeblocks daily, MinGW64 has its own pthread library (winpthread) though the old one also Works fine.
Productivity is a state of mind.
User avatar
revelator
 
Posts: 2548
Joined: Thu Jan 24, 2008 12:04 pm
Location: inside tha debugger

Re: SIMD/SSE Instructions

Postby jitspoe » Wed Jun 05, 2013 2:31 am

A couple more updates:

I came up with a SIMD version to do this calculation that, when the normal was loaded in the register, ended up being faster than the scalar version:

Code: Select all
void TestDotsSIMDAlt1 (int count)
{
   int j;
   __m128 one = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);

   __m128 shadelightM = _mm_loadu_ps(LightValF);
   //__m128 normal = _mm_loadu_ps(inNormalsArray[0]); // testing a scenario where we already have the normal in memory...

   for (j = 0; j < count; ++j)
   {
      __m128 normal = _mm_loadu_ps(inNormalsArray[j]);
      //__m128 normal = inNormalsArrayM[j];
      //__m128 normal = _mm_set_ps(inNormalsArray[j][0], inNormalsArray[j][1], inNormalsArray[j][2], 0.0f);
      register __m128 normmul = _mm_mul_ps(normal, LightAngle);
      register __m128 dot = _mm_add_ss(_mm_shuffle_ps(normmul, normmul, 1), _mm_add_ps(_mm_movehl_ps(normmul, normmul), normmul));
      register __m128 lightcalc = _mm_add_ss(_mm_mul_ss(_mm_add_ss(dot, one), halflight), ambient);
      register __m128 shuffled = _mm_shuffle_ps(lightcalc, lightcalc, 0);
      register __m128 out = _mm_mul_ps(shadelightM, shuffled);
      _mm_storeu_ps(colorArray[j], out);
   }
}


As it is there, using the loadu, it's slower. If I use the one before the loop, it's 3x's faster than the non-simd version (kind of what I was hoping to get). I figured if I moved the calculations into the loop that generates the normal, I'd have it already in a register, and it would be faster...

It wasn't :( Seems no matter what I do, the non-simd version is faster.

Well, that's not entirely true. I do think I have a SIMD version that's SLIGHTLY faster:

Code: Select all
void TestDotsSIMD (int count)
{
   __m128 shadelightM = _mm_loadu_ps(LightValF);
   int j;

   for (j = 0; j < count; j += 4)
   {
      // todo: align for better performance, and is there a faster way to store these in general?
      // Note: Aligning seems to have no performance impact unless the memory is actually unaligned (compiler automatically optimizes this?)
//#define LOAD_VER2
#define LOAD_VER3
//#define LOAD_VER4
#ifdef LOAD_VER4
      __m128 normalsXm = _mm_set_ps(inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0]);
      __m128 normalsYm = _mm_set_ps(inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1]);
      __m128 normalsZm = _mm_set_ps(inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2]);
#else
#ifdef LOAD_VER3
      
      __m128 row0 = _mm_loadu_ps(inNormalsArray[j + 0]);
      __m128 row1 = _mm_loadu_ps(inNormalsArray[j + 1]);
      __m128 row2 = _mm_loadu_ps(inNormalsArray[j + 2]);
      __m128 row3 = _mm_loadu_ps(inNormalsArray[j + 3]);

      /*
      __m128 row0 = inNormalsArrayM[j + 0];
      __m128 row1 = inNormalsArrayM[j + 1];
      __m128 row2 = inNormalsArrayM[j + 2];
      __m128 row3 = inNormalsArrayM[j + 3];
*/
      __m128 tmp0 = _mm_shuffle_ps((row0), (row1), 0x44);
      __m128 tmp2 = _mm_shuffle_ps((row0), (row1), 0xEE);
      __m128 tmp1 = _mm_shuffle_ps((row2), (row3), 0x44);
      __m128 tmp3 = _mm_shuffle_ps((row2), (row3), 0xEE);
      __m128 normalsXm = _mm_shuffle_ps(tmp0, tmp1, 0x88);
      __m128 normalsYm = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
      __m128 normalsZm = _mm_shuffle_ps(tmp2, tmp3, 0x88);
      //(row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
#else
#ifdef LOAD_VER2
      /*__m128 normals0 = _mm_loadu_ps(inNormalsArray[j + 0]);
      __m128 normals1 = _mm_loadu_ps(inNormalsArray[j + 1]);
      __m128 normals2 = _mm_loadu_ps(inNormalsArray[j + 2]);
      __m128 normals3 = _mm_loadu_ps(inNormalsArray[j + 3]);*/
      __m128 normalsXm = _mm_load_ps(inNormalsArrayX + j); // pretend we have data structured differently - see how fast this is...
      __m128 normalsYm = _mm_load_ps(inNormalsArrayY + j);
      __m128 normalsZm = _mm_load_ps(inNormalsArrayZ + j);
#else
      ALIGN(float normalsXf[4]) = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
      ALIGN(float normalsYf[4]) = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
      ALIGN(float normalsZf[4]) = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
      __m128 normalsXm = _mm_load_ps(normalsXf);
      __m128 normalsYm = _mm_load_ps(normalsYf);
      __m128 normalsZm = _mm_load_ps(normalsZf);
#endif
#endif
#endif

      __m128 lightOut = _mm_add_ps(_mm_mul_ps(
         _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(normalsXm, lightdirX), _mm_mul_ps(normalsYm, lightdirY)), _mm_mul_ps(normalsZm, lightdirZ)), one),
         halflight), ambient);
      __m128 lightOut0 = _mm_shuffle_ps(lightOut, lightOut, 0x00);
      __m128 lightOut1 = _mm_shuffle_ps(lightOut, lightOut, 0x55);
      __m128 lightOut2 = _mm_shuffle_ps(lightOut, lightOut, 0xAA);
      __m128 lightOut3 = _mm_shuffle_ps(lightOut, lightOut, 0xFF);
//#define WRITE_COLOR_MM
#ifdef WRITE_COLOR_MM
      colorArrayM[j    ] = _mm_mul_ps(lightOut0, shadelightM);
      colorArrayM[j + 1] = _mm_mul_ps(lightOut1, shadelightM);
      colorArrayM[j + 2] = _mm_mul_ps(lightOut2, shadelightM);
      colorArrayM[j + 3] = _mm_mul_ps(lightOut3, shadelightM);
#else
      __m128 colorOut0 = _mm_mul_ps(lightOut0, shadelightM);
      __m128 colorOut1 = _mm_mul_ps(lightOut1, shadelightM);
      __m128 colorOut2 = _mm_mul_ps(lightOut2, shadelightM);
      __m128 colorOut3 = _mm_mul_ps(lightOut3, shadelightM);
      _mm_storeu_ps(colorArray[j + 0], colorOut0);
      _mm_storeu_ps(colorArray[j + 1], colorOut1);
      _mm_storeu_ps(colorArray[j + 2], colorOut2);
      _mm_storeu_ps(colorArray[j + 3], colorOut3);
#endif
   }
}


I haven't actually tried it in practice, yet, but in my test program it's like 5-10% faster. I'm not sure if it's worth replacing a super simple loop with a huge block of difficult to read code, though. And in the end, it might end up being slower, like everything else I've tried...

I figured it probably didn't matter too much because I was going to try using OpenGL lighting at some point, so it would be hardware accelerated on cards that supported it. I tried that -- passed the normal array in, enabled GL_LIGHTING, and didn't compute the color array. It did not, however, run faster. It may have even been slower (timings were so close, it was difficult to tell). I added a second light for kicks, and it was definitely slower in that case..

So now I'm wondering... is it actually computing the lighting on the hardware, or are the drivers doing it in software? I have an nVidia GeForce GTX 285.

I'm kind of baffled that I've spent like 2 days on this, and the basic, software for loop full of scalar operations seems to be the fastest approach.
jitspoe
 
Posts: 217
Joined: Mon Jan 17, 2005 5:27 am

Re: SIMD/SSE Instructions

Postby revelator » Wed Jun 05, 2013 4:25 am

Some code from MH i use with my own fork of Vanilla doom3.

Replaces memcpy with an asm optimized version and its way faster than anything i have tried so far :)

Code: Select all
// memcpy replacement using MMX non-temporal stores (movntq), intended for
// large copies that should bypass the cache. MSVC 32-bit inline asm only.
// Copies in descending chunk sizes (64/32/16/8 bytes via MMX, then
// dwords/words/bytes via string instructions), ending with emms + sfence.
// NOTE(review): there is no C "return" statement — the result is returned
// via the final "mov eax, [dst]", which relies on the MSVC convention that
// eax holds the return value. Confirm the compiler does not warn/mangle this.
// NOTE(review): "count" (the parameter's stack slot) is decremented in place
// as the copy progresses; src/dst in esi/edi likewise advance.
static void *RB_MemCpy(void *dst, const void *src, size_t count)
{
   __asm
   {
      mov         esi, dword ptr [src]
      mov         edi, dword ptr [dst]

      // Fewer than 64 bytes left? Skip to the 32-byte loop.
      cmp         dword ptr [count], 64
      jl         TryCopyQWord32

CopyQWord64:
      // Copy 64 bytes per iteration through all 8 MMX registers.
      movq      mm0, [esi]
      movq      mm1, [esi + 8]
      movq      mm2, [esi + 16]
      movq      mm3, [esi + 24]
      movq      mm4, [esi + 32]
      movq      mm5, [esi + 40]
      movq      mm6, [esi + 48]
      movq      mm7, [esi + 56]
      add         esi, 64

      // Non-temporal stores: write-combining, bypasses the cache.
      movntq      [edi], mm0
      movntq      [edi + 8], mm1
      movntq      [edi + 16], mm2
      movntq      [edi + 24], mm3
      movntq      [edi + 32], mm4
      movntq      [edi + 40], mm5
      movntq      [edi + 48], mm6
      movntq      [edi + 56], mm7
      add         edi, 64

      sub         dword ptr [count], 64
      cmp         dword ptr [count], 64
      jge         CopyQWord64

TryCopyQWord32:
      cmp         dword ptr [count], 32
      jl         TryCopyQWord16

CopyQWord32:
      // Copy 32 bytes per iteration.
      movq      mm0, [esi]
      movq      mm1, [esi + 8]
      movq      mm2, [esi + 16]
      movq      mm3, [esi + 24]
      add         esi, 32

      movntq      [edi], mm0
      movntq      [edi + 8], mm1
      movntq      [edi + 16], mm2
      movntq      [edi + 24], mm3
      add         edi, 32

      sub         dword ptr [count], 32
      cmp         dword ptr [count], 32
      jge         CopyQWord32

TryCopyQWord16:
      cmp         dword ptr [count], 16
      jl         TryCopyQWord8

CopyQWord16:
      // Copy 16 bytes per iteration.
      movq      mm0, [esi]
      movq      mm1, [esi + 8]
      add         esi, 16

      movntq      [edi], mm0
      movntq      [edi + 8], mm1
      add         edi, 16

      sub         dword ptr [count], 16
      cmp         dword ptr [count], 16
      jge         CopyQWord16

TryCopyQWord8:
      cmp         dword ptr [count], 8
      jl         TryCopyDWord

CopyQWord8:
      // Copy 8 bytes per iteration.
      movq      mm0, [esi]
      add         esi, 8

      movntq      [edi], mm0
      add         edi, 8

      sub         dword ptr [count], 8
      cmp         dword ptr [count], 8
      jge         CopyQWord8

TryCopyDWord:
      // 4..7 bytes left: copy the dwords with rep movsd.
      cmp         dword ptr [count], 3
      jle         TryCopyWord

      mov         ecx, dword ptr [count]
      shr         ecx, 2
      mov         eax, ecx
      rep movsd

      shl         eax, 2
      sub         dword ptr [count], eax

TryCopyWord:
      // 2..3 bytes left: copy one word.
      cmp         dword ptr [count], 1
      jle         TryCopyByte

      movsw

      sub         dword ptr [count], 2

TryCopyByte:
      // 1 byte left: copy it.
      cmp         dword ptr [count], 0
      je         CopyDone

      movsb

CopyDone:
      emms        // leave MMX state so the FPU is usable again
      sfence      // order the non-temporal stores before any later reads
      mov         eax, [dst]   // return dst (MSVC return-value convention)
   }
}
Productivity is a state of mind.
User avatar
revelator
 
Posts: 2548
Joined: Thu Jan 24, 2008 12:04 pm
Location: inside tha debugger

Next

Return to General Programming

Who is online

Users browsing this forum: No registered users and 1 guest