A couple more updates:
I came up with a SIMD version to do this calculation that, when the normal was loaded in the register, ended up being faster than the scalar version:
Code: Select all
void TestDotsSIMDAlt1 (int count)
{
int j;
__m128 one = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
__m128 shadelightM = _mm_loadu_ps(LightValF);
//__m128 normal = _mm_loadu_ps(inNormalsArray[0]); // testing a scenario where we already have the normal in memory...
for (j = 0; j < count; ++j)
{
__m128 normal = _mm_loadu_ps(inNormalsArray[j]);
//__m128 normal = inNormalsArrayM[j];
//__m128 normal = _mm_set_ps(inNormalsArray[j][0], inNormalsArray[j][1], inNormalsArray[j][2], 0.0f);
register __m128 normmul = _mm_mul_ps(normal, LightAngle);
register __m128 dot = _mm_add_ss(_mm_shuffle_ps(normmul, normmul, 1), _mm_add_ps(_mm_movehl_ps(normmul, normmul), normmul));
register __m128 lightcalc = _mm_add_ss(_mm_mul_ss(_mm_add_ss(dot, one), halflight), ambient);
register __m128 shuffled = _mm_shuffle_ps(lightcalc, lightcalc, 0);
register __m128 out = _mm_mul_ps(shadelightM, shuffled);
_mm_storeu_ps(colorArray[j], out);
}
}
As it is there, using the loadu inside the loop, it's slower. If I use the load from before the loop instead (the commented-out line), it's 3x faster than the non-SIMD version (kind of what I was hoping to get). I figured if I moved the calculations into the loop that generates the normal, I'd already have it in a register, and it would be faster...
It wasn't
Seems no matter what I do, the non-simd version is faster.
Well, that's not entirely true. I do think I have a SIMD version that's SLIGHTLY faster:
Code: Select all
void TestDotsSIMD (int count)
{
__m128 shadelightM = _mm_loadu_ps(LightValF);
int j;
for (j = 0; j < count; j += 4)
{
// todo: align for better performance, and is there a faster way to store these in general?
// Note: Aligning seems to have no performance impact unless the memory is actually unaligned (compiler automatically optimizes this?)
//#define LOAD_VER2
#define LOAD_VER3
//#define LOAD_VER4
#ifdef LOAD_VER4
__m128 normalsXm = _mm_set_ps(inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0]);
__m128 normalsYm = _mm_set_ps(inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1]);
__m128 normalsZm = _mm_set_ps(inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2]);
#else
#ifdef LOAD_VER3
__m128 row0 = _mm_loadu_ps(inNormalsArray[j + 0]);
__m128 row1 = _mm_loadu_ps(inNormalsArray[j + 1]);
__m128 row2 = _mm_loadu_ps(inNormalsArray[j + 2]);
__m128 row3 = _mm_loadu_ps(inNormalsArray[j + 3]);
/*
__m128 row0 = inNormalsArrayM[j + 0];
__m128 row1 = inNormalsArrayM[j + 1];
__m128 row2 = inNormalsArrayM[j + 2];
__m128 row3 = inNormalsArrayM[j + 3];
*/
__m128 tmp0 = _mm_shuffle_ps((row0), (row1), 0x44);
__m128 tmp2 = _mm_shuffle_ps((row0), (row1), 0xEE);
__m128 tmp1 = _mm_shuffle_ps((row2), (row3), 0x44);
__m128 tmp3 = _mm_shuffle_ps((row2), (row3), 0xEE);
__m128 normalsXm = _mm_shuffle_ps(tmp0, tmp1, 0x88);
__m128 normalsYm = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
__m128 normalsZm = _mm_shuffle_ps(tmp2, tmp3, 0x88);
//(row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
#else
#ifdef LOAD_VER2
/*__m128 normals0 = _mm_loadu_ps(inNormalsArray[j + 0]);
__m128 normals1 = _mm_loadu_ps(inNormalsArray[j + 1]);
__m128 normals2 = _mm_loadu_ps(inNormalsArray[j + 2]);
__m128 normals3 = _mm_loadu_ps(inNormalsArray[j + 3]);*/
__m128 normalsXm = _mm_load_ps(inNormalsArrayX + j); // pretend we have data structured differently - see how fast this is...
__m128 normalsYm = _mm_load_ps(inNormalsArrayY + j);
__m128 normalsZm = _mm_load_ps(inNormalsArrayZ + j);
#else
ALIGN(float normalsXf[4]) = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
ALIGN(float normalsYf[4]) = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
ALIGN(float normalsZf[4]) = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
__m128 normalsXm = _mm_load_ps(normalsXf);
__m128 normalsYm = _mm_load_ps(normalsYf);
__m128 normalsZm = _mm_load_ps(normalsZf);
#endif
#endif
#endif
__m128 lightOut = _mm_add_ps(_mm_mul_ps(
_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(normalsXm, lightdirX), _mm_mul_ps(normalsYm, lightdirY)), _mm_mul_ps(normalsZm, lightdirZ)), one),
halflight), ambient);
__m128 lightOut0 = _mm_shuffle_ps(lightOut, lightOut, 0x00);
__m128 lightOut1 = _mm_shuffle_ps(lightOut, lightOut, 0x55);
__m128 lightOut2 = _mm_shuffle_ps(lightOut, lightOut, 0xAA);
__m128 lightOut3 = _mm_shuffle_ps(lightOut, lightOut, 0xFF);
//#define WRITE_COLOR_MM
#ifdef WRITE_COLOR_MM
colorArrayM[j ] = _mm_mul_ps(lightOut0, shadelightM);
colorArrayM[j + 1] = _mm_mul_ps(lightOut1, shadelightM);
colorArrayM[j + 2] = _mm_mul_ps(lightOut2, shadelightM);
colorArrayM[j + 3] = _mm_mul_ps(lightOut3, shadelightM);
#else
__m128 colorOut0 = _mm_mul_ps(lightOut0, shadelightM);
__m128 colorOut1 = _mm_mul_ps(lightOut1, shadelightM);
__m128 colorOut2 = _mm_mul_ps(lightOut2, shadelightM);
__m128 colorOut3 = _mm_mul_ps(lightOut3, shadelightM);
_mm_storeu_ps(colorArray[j + 0], colorOut0);
_mm_storeu_ps(colorArray[j + 1], colorOut1);
_mm_storeu_ps(colorArray[j + 2], colorOut2);
_mm_storeu_ps(colorArray[j + 3], colorOut3);
#endif
}
}
I haven't actually tried it in practice, yet, but in my test program it's like 5-10% faster. I'm not sure if it's worth replacing a super simple loop with a huge block of difficult to read code, though. And in the end, it might end up being slower, like everything else I've tried...
I figured it probably didn't matter too much because I was going to try using OpenGL lighting at some point, so it would be hardware accelerated on cards that supported it. I tried that -- passed the normal array in, enabled GL_LIGHTING, and didn't compute the color array. It did not, however, run faster. It may have even been slower (timings were so close, it was difficult to tell). I added a second light for kicks, and it was definitely slower in that case.
So now I'm wondering... is it actually computing the lighting on the hardware, or are the drivers doing it in software? I have an nVidia GeForce GTX 285.
I'm kind of baffled that I've spent like 2 days on this, and the basic, software for loop full of scalar operations seems to be the fastest approach.