SIMD/SSE Instructions
Anybody here messed around with simd instructions/intrinsics? I just recently tried them for the first time and attempted to optimize some skeletal model code in Paintball2. I was able to get some pretty decent speed improvements (around 14% overall fps increase in a scenario with lots of player models), but I don't feel like I'm doing things as optimally as I could. I think I need to write a test app to just try a bunch of different functions and see how they perform. I'm having a difficult time finding reasonable information on each function. Either it's a 1 sentence, vague description, or a 20 page thesis that I don't feel like reading through.
Things I'd like to know:
How many cycles does it take to do a load and store vs a loadu and storeu? How many operations do I need to do before it justifies doing the load and store? How many cycles does it take to shuffle? Is there a better way of shuffling data around into a more vertical format?
I feel like I end up doing so much prep work to get the data into a simd-friendly format, that it negates some (or all) of the benefits of doing simd.
When I tried to use simd for lighting the model (lots of dot products), it ended up being slower.
I know most of this stuff can be done on the graphics card, but I kind of want to optimize for the (surprisingly common) case of people with fancy processors and crap video cards.
Re: SIMD/SSE Instructions
Doom3 uses it extensively, though not the SSE3 paths, since it seems they're slower than the non-SSE code (not only noted by id; several sources point to it being a fact).
Could probably get some good stuff from its idlib if you had a look.
Productivity is a state of mind.
Re: SIMD/SSE Instructions
You know, I still haven't really taken a good look at the Doom3 code. I should do that one of these days.
I made a rather interesting discovery... apparently using double for local variables is faster than float. I'm trying to isolate and optimize a specific block of code. I'll just paste the whole test program (excuse the mess):
Results:
The double version is 30-40% faster. Just a single floating point variable makes that much of a difference. I was trying to make the code more readable by separating the dot product into a separate variable and noticed it caused the time to be a lot slower. I switched it to a double, for kicks, and the time was about the same. Removed the second variable and made the remaining one a double and, bam, 30+% speed boost! All these years I assumed float was faster or at least on par with double. Now I feel kind of dumb. I did another test switching the arrays to doubles, but this appeared to slow it down again, so the trick seems to be: store large sets of data as floats, use doubles for local variables.
As for the simd... well, it's slower than the floating point version until I get over 9000 elements in the array.
Result with a count of 40000:
I find it really strange that it takes that many loops to get a benefit from the simd version... or that the ratios really change much at all after 400 or so... odd. I'm sure my simd version can be optimized, but I'm wondering if it's even possible to compete with the double version.
Code:
#include <stdio.h>
#include <stdlib.h> // for rand()/RAND_MAX
#include <conio.h>
#include <xmmintrin.h>
#include <windows.h>
#if defined(_MSC_VER)
#define ALIGN(var) __declspec(align(16)) var
#define ATOMIC(var) __declspec(align(4)) var
#else
#define ALIGN(var) var __attribute__((__aligned__(16)))
#define ATOMIC(var) var __attribute__((__aligned__(4)))
#endif
float frand() // value between [0 and 1)
{
return (float)rand() / ((float)RAND_MAX + 1.0f); // cast before adding 1 so this can't overflow when RAND_MAX == INT_MAX
}
#define NORMAL_COUNT 400
float inNormalsArray[NORMAL_COUNT][3];
float colorArray[NORMAL_COUNT][3];
double colorArrayD[NORMAL_COUNT][3];
__m128 colorArrayM[NORMAL_COUNT];
float LightAngleF[3];
float LightValF[3];
ALIGN(float LightOutAligned[NORMAL_COUNT][4]);
__m128 LightAngle;
__m128 lightdirX, lightdirY, lightdirZ;
__m128 one, ambient, halflight;
void PrepareData ()
{
int i, j;
float f1 = 1.0f;
float fAmbient = 0.3f;
float fHalfLight = (1.0f - fAmbient) * 0.5f;
LightAngleF[0] = 0.1f;
LightAngleF[1] = 0.2f;
LightAngleF[2] = 0.3f; // meh, we don't really need to bother normalizing for a speed test.
LightValF[0] = 0.6f;
LightValF[1] = 0.7f;
LightValF[2] = 0.8f;
LightAngle = _mm_loadu_ps(LightAngleF);
lightdirX = _mm_shuffle_ps(LightAngle, LightAngle, 0x00);
lightdirY = _mm_shuffle_ps(LightAngle, LightAngle, 0x55);
lightdirZ = _mm_shuffle_ps(LightAngle, LightAngle, 0xAA);
one = _mm_load_ss(&f1);
one = _mm_shuffle_ps(one, one, 0x00); // todo: init somewhere so we don't call this for each mesh.
ambient = _mm_load_ss(&fAmbient);
halflight = _mm_load_ss(&fHalfLight);
ambient = _mm_shuffle_ps(ambient, ambient, 0x00);
halflight = _mm_shuffle_ps(halflight, halflight, 0x00);
for (i = 0; i < NORMAL_COUNT; ++i)
{
float length = 0;
for (j = 0; j < 3; ++j)
{
inNormalsArray[i][j] = frand();
// meh, we don't really need to bother normalizing for a speed test.
}
}
}
void TestDotsScalarFloat (int count)
{
int i;
for (i = 0; i < count; ++i)
{
float light = ((LightAngleF[0] * inNormalsArray[i][0] + LightAngleF[1] * inNormalsArray[i][1] + LightAngleF[2] * inNormalsArray[i][2]) + 1.0f) * 0.5f * 0.7f + 0.3f;
colorArray[i][0] = LightValF[0] * light;
colorArray[i][1] = LightValF[1] * light;
colorArray[i][2] = LightValF[2] * light;
}
}
void TestDotsScalarDouble (int count)
{
int i;
for (i = 0; i < count; ++i)
{
double light = ((LightAngleF[0] * inNormalsArray[i][0] + LightAngleF[1] * inNormalsArray[i][1] + LightAngleF[2] * inNormalsArray[i][2]) + 1.0) * 0.5 * 0.7 + 0.3;
colorArray[i][0] = (float)(LightValF[0] * light);
colorArray[i][1] = (float)(LightValF[1] * light);
colorArray[i][2] = (float)(LightValF[2] * light);
}
}
#if 1
void TestDotsSIMD (int count)
{
__m128 shadelightM = _mm_loadu_ps(LightValF);
int j;
for (j = 0; j < count; j += 4)
{
// todo: align for better performance, and is there a faster way to store these in general?
// Note: Aligning seems to have no performance impact unless the memory is actually unaligned (compiler automatically optimizes this?)
#define TEST_ALIGN
#ifdef TEST_ALIGN
ALIGN(float normalsXf[4]) = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
ALIGN(float normalsYf[4]) = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
ALIGN(float normalsZf[4]) = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
__m128 normalsXm = _mm_load_ps(normalsXf);
__m128 normalsYm = _mm_load_ps(normalsYf);
__m128 normalsZm = _mm_load_ps(normalsZf);
#else
float normalsXf[5] = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
float normalsYf[5] = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
float normalsZf[5] = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
__m128 normalsXm = _mm_loadu_ps(normalsXf);
__m128 normalsYm = _mm_loadu_ps(normalsYf);
__m128 normalsZm = _mm_loadu_ps(normalsZf);
#endif
__m128 lightOut = _mm_add_ps(_mm_mul_ps(
_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(normalsXm, lightdirX), _mm_mul_ps(normalsYm, lightdirY)), _mm_mul_ps(normalsZm, lightdirZ)), one),
halflight), ambient);
__m128 lightOut0 = _mm_shuffle_ps(lightOut, lightOut, 0x00);
__m128 lightOut1 = _mm_shuffle_ps(lightOut, lightOut, 0x55);
__m128 lightOut2 = _mm_shuffle_ps(lightOut, lightOut, 0xAA);
__m128 lightOut3 = _mm_shuffle_ps(lightOut, lightOut, 0xFF);
__m128 colorOut0 = _mm_mul_ps(lightOut0, shadelightM);
__m128 colorOut1 = _mm_mul_ps(lightOut1, shadelightM);
__m128 colorOut2 = _mm_mul_ps(lightOut2, shadelightM);
__m128 colorOut3 = _mm_mul_ps(lightOut3, shadelightM);
// note: each of these writes 4 floats into a float[3] row; the overlap is
// overwritten by the following store, but the last store runs 1 float past the row.
_mm_storeu_ps(colorArray[j + 0], colorOut0);
_mm_storeu_ps(colorArray[j + 1], colorOut1);
_mm_storeu_ps(colorArray[j + 2], colorOut2);
_mm_storeu_ps(colorArray[j + 3], colorOut3);
}
}
#endif
int main (int argc, char **argv)
{
LARGE_INTEGER timeStart, timeEnd, queryFrequency;
float perfToMS;
float ms;
PrepareData();
QueryPerformanceFrequency(&queryFrequency);
perfToMS = 1000.0f / queryFrequency.QuadPart;
QueryPerformanceCounter(&timeStart);
TestDotsScalarFloat(NORMAL_COUNT);
QueryPerformanceCounter(&timeEnd);
ms = (timeEnd.QuadPart - timeStart.QuadPart) * perfToMS;
printf("scalar float : %gms\n", ms);
QueryPerformanceCounter(&timeStart);
TestDotsScalarDouble(NORMAL_COUNT);
QueryPerformanceCounter(&timeEnd);
ms = (timeEnd.QuadPart - timeStart.QuadPart) * perfToMS;
printf("scalar double: %gms\n", ms);
QueryPerformanceCounter(&timeStart);
TestDotsSIMD(NORMAL_COUNT);
QueryPerformanceCounter(&timeEnd);
ms = (timeEnd.QuadPart - timeStart.QuadPart) * perfToMS;
printf("SIMD : %gms\n", ms);
_getch();
return 0;
}
Code:
scalar float : 0.0111997ms
scalar double: 0.00703982ms
SIMD : 0.0143996ms
Code:
scalar float : 0.363831ms
scalar double: 0.176956ms
SIMD : 0.332792ms
Re: SIMD/SSE Instructions
Well, I feel dumb again. Looks like the double version was faster because of caching. If I swap the order, the float version is faster. In reality, they're effectively the same... I think.
What's strange, though, is that I had both versions in Paintball2, with a cvar to alternate between the two, and it seemed there was a 3% or so overall FPS boost using the double version.
Edit: Scratch that, I screwed some stuff up when I was testing, the double version is, indeed, faster, but the first run through is a little slower regardless.
With a few more runs:
Code:
scalar float : 0.358391ms
scalar double: 0.178556ms
scalar float : 0.255674ms
scalar double: 0.162876ms
SIMD : 0.327992ms
Re: SIMD/SSE Instructions
Another update:
Using the /fp:fast optimization makes the double and float performance the same (so this is probably something you should make sure you have enabled in your engine if you don't already, since we generally care more about speed than precision).
I've managed to make the SIMD version faster than the others by changing the data structure of the normals. Instead of having an array of vec3's, I changed it to 3 aligned arrays of floats. This avoids creating the local float[4]'s, which I think was causing the biggest performance hit.
That said, though, storing the normals in that format isn't ideal for other calculations, and you couldn't pass those arrays directly to OpenGL (I don't think). So... meh... SIMD optimization is tough.
Re: SIMD/SSE Instructions
I found some SSE code for things like square root and other math lib replacements.
If I detect SSE at init, will this simple asm code be faster than the normal functions?
I mean, for Quake1, is it much benefit?
It looks like a simple cut/paste codewise.
Re: SIMD/SSE Instructions
Depends. For engines like Tenebrae it makes quite a difference, because it calculates a lot of heavy stuff on the CPU. DarkPlaces seems to get along with the macro math functions just fine, though it's also a lot more refined than Tenebrae was in its day. Quake2xp ditto, since it uses glslang for the gfx-intensive calculations.
Guess the only way to find out is benchmarking after changes and then comparing results.
Re: SIMD/SSE Instructions
SSE is usually not a simple cut/paste. You have to set your data up so things can be run in parallel, and often the price you pay to load/store/shuffle data around is greater than the benefit you get. You may end up having to rearchitect data structures and all the code that uses them in order to see a benefit.
If you're talking about replacing a simple sqrt with the SSE version, then it's going to be slower. SSE instructions are set up to run 4 operations at once (in the typical case of 32-bit floats in an __m128). In fact, you can't just call an SSE function with a floating point value; you have to load it into the SIMD registers, then store the result back to floating point.
Re: SIMD/SSE Instructions
Aye, SSE is not good for dot products (unlike 3DNow!), which are pretty common in 3D engines.
Supposedly this stuff is meant to be improved with the SSE4 instructions.
Don't underestimate random transposes.
But yeah, avoid copying data from x87 to SSE and back. If you use a little SSE on your data, use a lot instead.
And yeah, GLSL is preferred for calculating dot products for lighting, as well as skeletal transforms and stuff.
Re: SIMD/SSE Instructions
Still trying to wrap my noggin around multithreading, though. There are lots of implementations: gcc/omp -> pthreads, the msvc equivalent -> winthreads. Intel has their own version with even more optimization pragmas than msvc, and some other company I can't remember the name of uses a special pthreads syntax also. Compatibility is so-so: Intel's pragmas cannot be used by msvc without their package installed, besides the very basic commands msvc understands itself, and the same goes for that other company. Atm I think gcc has the upper hand when it comes to omp use, unless you buy Intel's compiler.
An interesting thing, and a good point to support the discussion here, is an example where a guy modified Doom3 to use omp/pthreads for multithreading and achieved some very interesting results speedwise.
So yeah, SSE definitely likes having a lot of data thrown at it.
Re: SIMD/SSE Instructions
Sorry for the doubled words in the above post; my keyboard needs a repair (mechanical Razer BlackWidow with the dreaded key-chattering bug).
Re: SIMD/SSE Instructions
From what I've read, the new dot product "instructions" end up taking just as many (if not more) cycles as doing it with the old instruction sets. I'll have to dig into the 3DNow stuff. I didn't realize it was optimized for dot products. I find it kind of strange that SSE wouldn't have that from the beginning. It's kind of fundamental for vector math...
As far as multithreading goes (a bit off topic), a friend of mine was suggesting openmp. I haven't tried that. I've done a little with pthread (you can get a library so you can run it in windows). I've also used some #defines to allow me to use the same syntax for threading on different OS's.
Re: SIMD/SSE Instructions
Only brought multithreading into the discussion because of SSE thread handling (you kinda need multithreading to run stuff in parallel).
I work with MinGW64/Code::Blocks daily; MinGW64 has its own pthread library (winpthread), though the old one also works fine.
Re: SIMD/SSE Instructions
A couple more updates:
I came up with a SIMD version to do this calculation that, when the normal was loaded in the register, ended up being faster than the scalar version:
As it is there, using the loadu, it's slower. If I use the one before the loop, it's 3x faster than the non-SIMD version (kind of what I was hoping to get). I figured if I moved the calculations into the loop that generates the normal, I'd have it already in a register, and it would be faster...
It wasn't. Seems no matter what I do, the non-SIMD version is faster.
Well, that's not entirely true. I do think I have a SIMD version that's SLIGHTLY faster:
I haven't actually tried it in practice, yet, but in my test program it's like 5-10% faster. I'm not sure if it's worth replacing a super simple loop with a huge block of difficult to read code, though. And in the end, it might end up being slower, like everything else I've tried...
I figured it probably didn't matter too much because I was going to try using OpenGL lighting at some point, so it would be hardware accelerated on cards that supported it. I tried that -- passed the normal array in, enabled GL_LIGHTING, and didn't compute the color array. It did not, however, run faster. It may have even been slower (timings were so close, it was difficult to tell). I added a second light for kicks, and it was definitely slower in that case.
So now I'm wondering... is it actually computing the lighting on the hardware, or are the drivers doing it in software? I have an nVidia GeForce GTX 285.
I'm kind of baffled that I've spent like 2 days on this, and the basic, software for loop full of scalar operations seems to be the fastest approach.
Code:
void TestDotsSIMDAlt1 (int count)
{
int j;
__m128 one = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
__m128 shadelightM = _mm_loadu_ps(LightValF);
//__m128 normal = _mm_loadu_ps(inNormalsArray[0]); // testing a scenario where we already have the normal in memory...
for (j = 0; j < count; ++j)
{
__m128 normal = _mm_loadu_ps(inNormalsArray[j]); // note: loads 4 floats from a float[3] row, so the last element reads 1 float past the array
//__m128 normal = inNormalsArrayM[j];
//__m128 normal = _mm_set_ps(inNormalsArray[j][0], inNormalsArray[j][1], inNormalsArray[j][2], 0.0f);
register __m128 normmul = _mm_mul_ps(normal, LightAngle);
register __m128 dot = _mm_add_ss(_mm_shuffle_ps(normmul, normmul, 1), _mm_add_ps(_mm_movehl_ps(normmul, normmul), normmul));
register __m128 lightcalc = _mm_add_ss(_mm_mul_ss(_mm_add_ss(dot, one), halflight), ambient);
register __m128 shuffled = _mm_shuffle_ps(lightcalc, lightcalc, 0);
register __m128 out = _mm_mul_ps(shadelightM, shuffled);
_mm_storeu_ps(colorArray[j], out); // likewise writes 4 floats into a float[3] row; the last iteration writes 1 float past the array
}
}
Code:
void TestDotsSIMD (int count)
{
__m128 shadelightM = _mm_loadu_ps(LightValF);
int j;
for (j = 0; j < count; j += 4)
{
// todo: align for better performance, and is there a faster way to store these in general?
// Note: Aligning seems to have no performance impact unless the memory is actually unaligned (compiler automatically optimizes this?)
//#define LOAD_VER2
#define LOAD_VER3
//#define LOAD_VER4
#ifdef LOAD_VER4
__m128 normalsXm = _mm_set_ps(inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0]);
__m128 normalsYm = _mm_set_ps(inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1]);
__m128 normalsZm = _mm_set_ps(inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2]);
#else
#ifdef LOAD_VER3
__m128 row0 = _mm_loadu_ps(inNormalsArray[j + 0]);
__m128 row1 = _mm_loadu_ps(inNormalsArray[j + 1]);
__m128 row2 = _mm_loadu_ps(inNormalsArray[j + 2]);
__m128 row3 = _mm_loadu_ps(inNormalsArray[j + 3]);
/*
__m128 row0 = inNormalsArrayM[j + 0];
__m128 row1 = inNormalsArrayM[j + 1];
__m128 row2 = inNormalsArrayM[j + 2];
__m128 row3 = inNormalsArrayM[j + 3];
*/
__m128 tmp0 = _mm_shuffle_ps((row0), (row1), 0x44);
__m128 tmp2 = _mm_shuffle_ps((row0), (row1), 0xEE);
__m128 tmp1 = _mm_shuffle_ps((row2), (row3), 0x44);
__m128 tmp3 = _mm_shuffle_ps((row2), (row3), 0xEE);
__m128 normalsXm = _mm_shuffle_ps(tmp0, tmp1, 0x88);
__m128 normalsYm = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
__m128 normalsZm = _mm_shuffle_ps(tmp2, tmp3, 0x88);
//(row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
#else
#ifdef LOAD_VER2
/*__m128 normals0 = _mm_loadu_ps(inNormalsArray[j + 0]);
__m128 normals1 = _mm_loadu_ps(inNormalsArray[j + 1]);
__m128 normals2 = _mm_loadu_ps(inNormalsArray[j + 2]);
__m128 normals3 = _mm_loadu_ps(inNormalsArray[j + 3]);*/
__m128 normalsXm = _mm_load_ps(inNormalsArrayX + j); // pretend we have data structured differently - see how fast this is...
__m128 normalsYm = _mm_load_ps(inNormalsArrayY + j);
__m128 normalsZm = _mm_load_ps(inNormalsArrayZ + j);
#else
ALIGN(float normalsXf[4]) = { inNormalsArray[j][0], inNormalsArray[j + 1][0], inNormalsArray[j + 2][0], inNormalsArray[j + 3][0] };
ALIGN(float normalsYf[4]) = { inNormalsArray[j][1], inNormalsArray[j + 1][1], inNormalsArray[j + 2][1], inNormalsArray[j + 3][1] };
ALIGN(float normalsZf[4]) = { inNormalsArray[j][2], inNormalsArray[j + 1][2], inNormalsArray[j + 2][2], inNormalsArray[j + 3][2] };
__m128 normalsXm = _mm_load_ps(normalsXf);
__m128 normalsYm = _mm_load_ps(normalsYf);
__m128 normalsZm = _mm_load_ps(normalsZf);
#endif
#endif
#endif
__m128 lightOut = _mm_add_ps(_mm_mul_ps(
_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(normalsXm, lightdirX), _mm_mul_ps(normalsYm, lightdirY)), _mm_mul_ps(normalsZm, lightdirZ)), one),
halflight), ambient);
__m128 lightOut0 = _mm_shuffle_ps(lightOut, lightOut, 0x00);
__m128 lightOut1 = _mm_shuffle_ps(lightOut, lightOut, 0x55);
__m128 lightOut2 = _mm_shuffle_ps(lightOut, lightOut, 0xAA);
__m128 lightOut3 = _mm_shuffle_ps(lightOut, lightOut, 0xFF);
//#define WRITE_COLOR_MM
#ifdef WRITE_COLOR_MM
colorArrayM[j ] = _mm_mul_ps(lightOut0, shadelightM);
colorArrayM[j + 1] = _mm_mul_ps(lightOut1, shadelightM);
colorArrayM[j + 2] = _mm_mul_ps(lightOut2, shadelightM);
colorArrayM[j + 3] = _mm_mul_ps(lightOut3, shadelightM);
#else
__m128 colorOut0 = _mm_mul_ps(lightOut0, shadelightM);
__m128 colorOut1 = _mm_mul_ps(lightOut1, shadelightM);
__m128 colorOut2 = _mm_mul_ps(lightOut2, shadelightM);
__m128 colorOut3 = _mm_mul_ps(lightOut3, shadelightM);
// same caveat as before: 4-float stores into float[3] rows, with a 1-float overrun on the last row
_mm_storeu_ps(colorArray[j + 0], colorOut0);
_mm_storeu_ps(colorArray[j + 1], colorOut1);
_mm_storeu_ps(colorArray[j + 2], colorOut2);
_mm_storeu_ps(colorArray[j + 3], colorOut3);
#endif
}
}
Re: SIMD/SSE Instructions
Some code from MH that I use with my own fork of vanilla Doom3.
It replaces memcpy with an asm-optimized version, and it's way faster than anything I have tried so far.
Code:
static void *RB_MemCpy(void *dst, const void *src, size_t count)
{
__asm
{
mov esi, dword ptr [src]
mov edi, dword ptr [dst]
cmp dword ptr [count], 64
jl TryCopyQWord32
CopyQWord64:
movq mm0, [esi]
movq mm1, [esi + 8]
movq mm2, [esi + 16]
movq mm3, [esi + 24]
movq mm4, [esi + 32]
movq mm5, [esi + 40]
movq mm6, [esi + 48]
movq mm7, [esi + 56]
add esi, 64
movntq [edi], mm0
movntq [edi + 8], mm1
movntq [edi + 16], mm2
movntq [edi + 24], mm3
movntq [edi + 32], mm4
movntq [edi + 40], mm5
movntq [edi + 48], mm6
movntq [edi + 56], mm7
add edi, 64
sub dword ptr [count], 64
cmp dword ptr [count], 64
jge CopyQWord64
TryCopyQWord32:
cmp dword ptr [count], 32
jl TryCopyQWord16
CopyQWord32:
movq mm0, [esi]
movq mm1, [esi + 8]
movq mm2, [esi + 16]
movq mm3, [esi + 24]
add esi, 32
movntq [edi], mm0
movntq [edi + 8], mm1
movntq [edi + 16], mm2
movntq [edi + 24], mm3
add edi, 32
sub dword ptr [count], 32
cmp dword ptr [count], 32
jge CopyQWord32
TryCopyQWord16:
cmp dword ptr [count], 16
jl TryCopyQWord8
CopyQWord16:
movq mm0, [esi]
movq mm1, [esi + 8]
add esi, 16
movntq [edi], mm0
movntq [edi + 8], mm1
add edi, 16
sub dword ptr [count], 16
cmp dword ptr [count], 16
jge CopyQWord16
TryCopyQWord8:
cmp dword ptr [count], 8
jl TryCopyDWord
CopyQWord8:
movq mm0, [esi]
add esi, 8
movntq [edi], mm0
add edi, 8
sub dword ptr [count], 8
cmp dword ptr [count], 8
jge CopyQWord8
TryCopyDWord:
cmp dword ptr [count], 3
jle TryCopyWord
mov ecx, dword ptr [count]
shr ecx, 2
mov eax, ecx
rep movsd
shl eax, 2
sub dword ptr [count], eax
TryCopyWord:
cmp dword ptr [count], 1
jle TryCopyByte
movsw
sub dword ptr [count], 2
TryCopyByte:
cmp dword ptr [count], 0
je CopyDone
movsb
CopyDone:
emms
sfence
mov eax, [dst]
}
}