glTexSubImage2D Performance

Discuss programming topics that involve the OpenGL API.
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

glTexSubImage2D Performance

Post by mh »

I've spent days beating on this one in the RMQ engine, and now you get to share the results. :D

Several of the maps have a very heavy dynamic/animated light content, with almost every surface in a scene having some kind of dynamic lighting on it, so the performance of dynamic light updates is utterly critical to overall performance of the engine. Most of the time it was perfectly fine, but there was one machine I tested on where things ground down to 10 FPS.

After trying various workarounds (PBOs, updating per-surface vs bulk updating), I finally did what I should have done in the first place and ground out some code to measure actual timings inside glTexSubImage2D. The results were quite surprising.

GLQuake by default uses internal format GL_RGBA, format GL_RGBA and type GL_UNSIGNED_BYTE. However, the fastest mode in all cases (even on machines that didn't slow down) turned out to be internal format GL_RGBA with format GL_BGRA and type GL_UNSIGNED_INT_8_8_8_8_REV. This needs OpenGL 1.2 support, so I guess 3DFX owners are excluded, but everyone else can join the party.
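In practical terms the change is nothing more than the format/type arguments passed to the upload call (plus writing your data in BGRA order); a minimal sketch, with texture setup and the data buffer omitted:

Code: Select all

/* GLQuake default: RGBA-ordered bytes */
glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, data);

/* the fast path: same GL_RGBA internal format on the texture, but the data is
   described as BGRA-ordered packed 32-bit values */
glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, width, height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, data);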

Measurements:

Code: Select all

mode: 0  320ms [GL_RGBA/GL_UNSIGNED_BYTE] (OK)
mode: 1  317ms [GL_BGRA/GL_UNSIGNED_BYTE] (OK)
mode: 2  377ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 3  375ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 4  376ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
mode: 5   12ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
Sample test app (SDL, mostly portable):

Code: Select all

#define WINDOW_WIDTH	800
#define WINDOW_HEIGHT	600

#include "SDL.h"
#include "SDL_opengl.h"

#pragma comment (lib, "SDL.lib")
#pragma comment (lib, "SDLmain.lib")
#pragma comment (lib, "opengl32.lib")

#define TEX_WIDTH 512
#define TEX_HEIGHT 512

unsigned int sibuffer[TEX_WIDTH * TEX_HEIGHT];
unsigned int teximage = 0;
unsigned int framecount = 0;

// find the fastest modes to use for glTexSubImage2D
typedef struct tsitest_s
{
	char formatstr[64];
	char typestr[64];
	GLenum format;
	GLenum type;
	int modespeed;
	bool failed;
} tsitest_t;

tsitest_t tsimodes[] =
{
	{"GL_RGBA", "GL_UNSIGNED_BYTE", GL_RGBA, GL_UNSIGNED_BYTE, 666, true},
	{"GL_BGRA", "GL_UNSIGNED_BYTE", GL_BGRA, GL_UNSIGNED_BYTE, 666, true},
	{"GL_RGBA", "GL_UNSIGNED_INT_8_8_8_8", GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, 666, true},
	{"GL_BGRA", "GL_UNSIGNED_INT_8_8_8_8", GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, 666, true},
	{"GL_RGBA", "GL_UNSIGNED_INT_8_8_8_8_REV", GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, 666, true},
	{"GL_BGRA", "GL_UNSIGNED_INT_8_8_8_8_REV", GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, 666, true}
};


int fastest = 666;
int bestspeed = 32768;

GLuint R_MakeMeATexture (int width, int height, GLenum format, GLenum type)
{
	GLuint texnum = 0;

	glEnable (GL_TEXTURE_2D);
	glGenTextures (1, &texnum);
	glBindTexture (GL_TEXTURE_2D, texnum);
	glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
	glTexParameteri (GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
	glTexImage2D (GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, format, type, NULL);

	// commit the buffer so that timings are valid
	glFinish ();

	return texnum;
}


void R_SetTSIMode (void)
{
	int numtsimodes = sizeof (tsimodes) / sizeof (tsitest_t);

	for (int i = 0; i < numtsimodes; i++)
	{
		// clear the last error (if any)
		glGetError ();

		// create a new texture object
		GLuint texnum = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type);

		Uint32 start = SDL_GetTicks ();

		// SDL_GetTicks has insufficient resolution to measure one call so we need to run a few of them
		for (int t = 0; t < 16; t++)
			glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type, sibuffer);

		Uint32 end = SDL_GetTicks ();

		// commit the buffer so that timings are valid
		glFinish ();

		glDeleteTextures (1, &texnum);

		if (glGetError () != GL_NO_ERROR)
			tsimodes[i].failed = true;
		else tsimodes[i].failed = false;

		tsimodes[i].modespeed = (end - start);
	}

	for (int i = 0; i < numtsimodes; i++)
	{
		printf ("mode: %i %4ims [%s/%s] (%s)\n", i, tsimodes[i].modespeed, tsimodes[i].formatstr, 
			tsimodes[i].typestr, tsimodes[i].failed ? "FAILED" : "OK");

		if (tsimodes[i].modespeed <= bestspeed && !tsimodes[i].failed)
		{
			bestspeed = tsimodes[i].modespeed;
			fastest = i;
		}
	}

	if (fastest == 666)
	{
		MessageBox (NULL, "Failed to find a format!", "Error", MB_OK | MB_ICONSTOP);
		exit (0);
	}
}


void RenderOpenGL (void)
{
	framecount++;

	for (int i = 0, w = 0; w < TEX_WIDTH; w++)
	{
		for (int h = 0; h < TEX_HEIGHT; h++, i++)
		{
			unsigned char *rgba = (unsigned char *) &sibuffer[i];

			// 2 == red, 1 == green, 0 == blue
			rgba[2] = ((h * i) + framecount) & 255;
			rgba[1] = ((w * h) + framecount) & 255;
			rgba[0] = ((w * i) + framecount) & 255;
			rgba[3] = 255;
		}
	}

	glClear (GL_COLOR_BUFFER_BIT);

	glViewport (0, 0, WINDOW_WIDTH, WINDOW_HEIGHT);

	glMatrixMode (GL_MODELVIEW);
	glLoadIdentity ();

	glMatrixMode (GL_PROJECTION);
	glLoadIdentity ();
	glOrtho (0, WINDOW_WIDTH, WINDOW_HEIGHT, 0, -99999, 99999);

	glBindTexture (GL_TEXTURE_2D, teximage);
	glTexEnvi (GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);

	glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[fastest].format, tsimodes[fastest].type, sibuffer);

	glBegin (GL_QUADS);

	glTexCoord2f (0, 0);
	glVertex2f (0, 0);

	glTexCoord2f (1, 0);
	glVertex2f (TEX_WIDTH, 0);

	glTexCoord2f (1, 1);
	glVertex2f (TEX_WIDTH, TEX_HEIGHT);

	glTexCoord2f (0, 1);
	glVertex2f (0, TEX_HEIGHT);

	glEnd ();
}


int main (int argc, char *argv[])
{
	if (SDL_Init (SDL_INIT_VIDEO | SDL_INIT_NOPARACHUTE) != 0)
	{
		printf ("Unable to initialize SDL: %s\n", SDL_GetError ());
		return 1;
	}

	SDL_GL_SetAttribute (SDL_GL_DOUBLEBUFFER, 1);
	SDL_Surface *screen = SDL_SetVideoMode (WINDOW_WIDTH, WINDOW_HEIGHT, 32, SDL_OPENGL);

	R_SetTSIMode ();
	teximage = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[fastest].format, tsimodes[fastest].type);
	glClearColor (0, 0, 0, 1);

	int done = 0;
	SDL_Event evt;

	while (!done)
	{
		while (!done && SDL_PollEvent (&evt))
		{
			if (evt.type == SDL_QUIT)
			{
				done = 1;
				break;
			}
		}

		// run the screen update here
		RenderOpenGL ();
		SDL_GL_SwapBuffers ();
	}

	return 0;
}
Compile and run; once you've got the timings out, you can entertain yourself with the pretty psychedelic picture.

It's definitely worthwhile including something like this in your video startup code and adjusting your formats and types.
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

I should add that the timing figures above are for my aberrant driver. Don't expect 30x performance on all hardware, although 2x/3x (lightmap update speed, not overall) is not unreasonable.

Also note that this was time explicitly spent inside glTexSubImage2D, and totally isolated to that function call. I suspect that the driver was pulling the teximage data back to system memory in order to do the update, but we'll probably never know for certain.
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
Spike
Posts: 2914
Joined: Fri Nov 05, 2004 3:12 am
Location: UK
Contact:

Post by Spike »

mode: 0 91ms [GL_RGBA/GL_UNSIGNED_BYTE] (OK)
mode: 1 32ms [GL_BGRA/GL_UNSIGNED_BYTE] (OK)
mode: 2 38ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 3 46ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 4 22ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
mode: 5 19ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
I guess that's the advantage of not using Intel. :P

But yeah, it's fairly well known that GPUs favour BGR byte ordering.

I don't understand the big difference between unsigned_byte and int_8_8_8_8_* though.
hmm
mode: 0 441ms [GL_RGBA/GL_UNSIGNED_BYTE] (OK)
mode: 1 289ms [GL_RGBA/GL_UNSIGNED_BYTE] (OK)
mode: 2 286ms [GL_BGRA/GL_UNSIGNED_BYTE] (OK)
mode: 3 287ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 4 313ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 5 277ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
mode: 6 277ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
new mode :P
also increased the loop count.

Strange that the first upload is so much slower, and even more so when uploaded 256 times.


But yeah, GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV is always the fastest on any PC/PPC hardware, or at least equal to the fastest.
Running it multiple times, there's a fair amount of variation.
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

I'm quite stumped at that one too. I'd never really even heard of the type before and the OpenGL docs aren't exactly shouting about it from the rooftops, yet it dates back to OpenGL 1.2, so support for it should be pretty much ubiquitous.

It's a good bit faster than GL_RGBA/GL_UNSIGNED_BYTE even on non-Intel hardware, and I guess the 3ms difference with GL_RGBA/GL_UNSIGNED_INT_8_8_8_8_REV is the overhead of the swizzling.

The first upload being slower would likely be explained by the driver doing a lazy texture bind, so it would be interesting to adjust the code a little to more accurately reflect Quake lightmap uploads, say by using 16 different textures. I'm guessing that the lazy bind would turn out to be most accurately reflective of the Real World under those circumstances.
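Something along these lines, maybe (only a sketch, untested, reusing tsimodes[] and R_MakeMeATexture from the test app above):

Code: Select all

#define NUM_LM_TEXTURES 16

// create a handful of textures and cycle the binds between uploads, so the
// driver can't coalesce everything onto a single texture object
GLuint lmtextures[NUM_LM_TEXTURES];

for (int t = 0; t < NUM_LM_TEXTURES; t++)
	lmtextures[t] = R_MakeMeATexture (TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type);

Uint32 start = SDL_GetTicks ();

for (int t = 0; t < 256; t++)
{
	glBindTexture (GL_TEXTURE_2D, lmtextures[t % NUM_LM_TEXTURES]);
	glTexSubImage2D (GL_TEXTURE_2D, 0, 0, 0, TEX_WIDTH, TEX_HEIGHT, tsimodes[i].format, tsimodes[i].type, sibuffer);
}

// finish before reading the end time so the upload cost is actually included
glFinish ();
Uint32 end = SDL_GetTicks ();

glDeleteTextures (NUM_LM_TEXTURES, lmtextures);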

The only reasonable explanation I can come up with for GL_UNSIGNED_INT_8_8_8_8_REV performance is that it's transferring 32 bits at a time, whereas GL_UNSIGNED_BYTE maybe transfers 8 bits at a time? I would have thought that drivers should show some more intelligence there, recognise an RGBA or BGRA format, and adjust accordingly, but it seems not so.
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
r00k
Posts: 1111
Joined: Sat Nov 13, 2004 10:39 pm

Post by r00k »

Using BGRA instead, I obviously get all blue-tinted textures without manipulating more code. Would using RGBA and GL_UNSIGNED_INT_8_8_8_8_REV really be that much slower?
Oddly, if I use lit files ON I get > 2000fps in a timerefresh vs. ~1300 without. I know timerefresh is just a lazy test, but hey.
Spike
Posts: 2914
Joined: Fri Nov 05, 2004 3:12 am
Location: UK
Contact:

Post by Spike »

For me, RGBA + GL_UNSIGNED_INT_8_8_8_8_REV was nearly the same speed as BGRA (task scheduling meant that it could sometimes be faster).
For mh, his drivers suck on everything but the modes that are made super-easy.
As always, it depends upon your drivers. It's just that BGRA always gives the driver less work; whether it can get the card to do it all automatically is a different matter.
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

Just swap R and B in your lightmap builder. :D
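Meaning: wherever the lightmap builder writes out its texel bytes, write them in BGRA order instead of RGBA. A generic sketch, not stock GLQuake code (dest and the r/g/b values are placeholders; little-endian byte order assumed, as in the test app's RenderOpenGL):

Code: Select all

// one texel, written for GL_BGRA / GL_UNSIGNED_INT_8_8_8_8_REV
dest[0] = b;	// blue first
dest[1] = g;
dest[2] = r;	// red last
dest[3] = 255;	// alpha unused
dest += 4;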

Yeah, it depends on drivers. The swizzling operation might be effectively free on some hardware, so you may not have to worry about it; other drivers might be intelligent enough to recognize a 32-bit format and transfer it through a fast path. As for the particular driver I was testing on, I reckon it was pulling everything back to system memory in order to do the update with most combinations.

The native GPU format will always be the best, all other factors being equal. Testing different formats and picking the fastest at startup time seems quite sensible to me; adjusting the palette and lightmap builder to match isn't really that much overhead.

I wouldn't trust timerefresh as a benchmark: it writes to the frontbuffer (which is eeeeeeeeevil on modern hardware), so there is a whole other set of variables at work there. A good timedemo with lots of flashing lights but not much else in the way of stress on the renderer would be more representative and would isolate the performance difference better. Run it with -nomtex and r_lightmap 1 for the full effect. Try ID1 demo1 or demo3 with entities and particles disabled, for example.

It would be interesting to get timings for this on ATI, or has that already been done? Being designed more around the D3D spec I would expect that they would behave more similarly to Intel (although obviously not in the same league as my shitty driver).
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
Spike
Posts: 2914
Joined: Fri Nov 05, 2004 3:12 am
Location: UK
Contact:

Post by Spike »

GL_VENDOR: ATI Technologies Inc.
GL_RENDERER: ATI Radeon HD 3200 Graphics
GL_VERSION: 3.3.9836 Compatibility Profile Context
GL_EXTENSIONS: 164 extensions

tis a laptop. that's my excuse for low FPS, and I'm sticking to it.
As you saw earlier, the swizzling is cheap enough that it doesn't really matter.

timerefresh is effective enough, but you do have to bear in mind that it's static, non-moving, and has a fixed time (read: no PVS changes so no cache flushes, and no lightstyle animations, so at most one lightmap update).

I seriously doubt that uploading as RGBA will ever be faster than BGRA, unless other things on the system interrupt and skew your app's perspective of time. I'd also be surprised if your drivers didn't support it.
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

Spike wrote:I'd also be surprised if your drivers didn't support it.
Agreed. BGRA dates back to OpenGL 1.2, which is 1998 technology. It's also native to D3D (although D3D calls it ARGB, it's in BGRA byte order - go figure (hmmmm - 8_8_8_8_REV... :idea: )), which doesn't even have a 32-bit RGBA format, so concerns that it's in any way "new", "edgy" or "dangerous" should be out the window. It would be more reasonable to be surprised that drivers still supported RGBA these days. (Of course OpenGL lets them emulate it in software, as I found out. Yes, I'm sore about it, and no, I don't care how often I say so. :twisted: )
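To spell the layout out (little-endian assumed): a D3D-style packed colour and a GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV texel are the same bytes in memory.

Code: Select all

// a D3D-style 0xAARRGGBB colour packed into a 32-bit int...
unsigned int argb = (a << 24) | (r << 16) | (g << 8) | b;

// ...lands in memory as B, G, R, A on a little-endian machine, which is exactly
// what GL_BGRA + GL_UNSIGNED_INT_8_8_8_8_REV describes to the driver
unsigned char *bytes = (unsigned char *) &argb;
// bytes[0] == b, bytes[1] == g, bytes[2] == r, bytes[3] == a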

Unless one is massively concerned about maintaining 3DFX compatibility there's no reason not to use it.
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
Spike
Posts: 2914
Joined: Fri Nov 05, 2004 3:12 am
Location: UK
Contact:

Post by Spike »

I'd be surprised if 3dfx didn't support it. Even Microsoft support it, and they don't support anything!

okay, so the 8_8_8_8_REV thing might not be, but GL_BGRA_EXT... it's gonna be supported.
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

Spike wrote:I'd be surprised if 3dfx didn't support it. even microsoft support it and they don't support anything!

okay, so the 8_8_8_8_rev thing might not be, but GL_BGRA_EXT... its gonna be supported.
That's in the 1.2 #defines as well. Maybe not a candidate on 3DFX though, but otherwise it's supported.
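If you want to be defensive about it, the check is cheap. A sketch (the full combination needs a 1.2-level context, so just test the reported version; atof needs <stdlib.h>):

Code: Select all

// GL_BGRA itself comes from GL_EXT_bgra, but the packed _REV types are core
// in GL 1.2, so the version string is the simplest test for the whole combo
const char *version = (const char *) glGetString (GL_VERSION);
int have_fast_path = (version && atof (version) >= 1.2);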
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

New figures for an Nvidia GT230M:

Code: Select all

mode: 0   17ms [GL_RGBA/GL_UNSIGNED_BYTE] (OK)
mode: 1    3ms [GL_BGRA/GL_UNSIGNED_BYTE] (OK)
mode: 2   10ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 3   14ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8] (OK)
mode: 4   13ms [GL_RGBA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
mode: 5    3ms [GL_BGRA/GL_UNSIGNED_INT_8_8_8_8_REV] (OK)
Just switching from RGBA to BGRA cuts the time spent in the driver to almost one-sixth.

It's only really needed for lightmaps though, so you just need to change around your R_BuildLightmap code; not a big deal.
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
r00k
Posts: 1111
Joined: Sat Nov 13, 2004 10:39 pm

Post by r00k »

For the GPU to jive nicely with texture uploads, is it best to init this way?
Code: Select all

static PIXELFORMATDESCRIPTOR pfd =
{
	sizeof (PIXELFORMATDESCRIPTOR),	// size of this pfd
	1,				// version number
	PFD_DRAW_TO_WINDOW		// support window
	| PFD_SUPPORT_OPENGL		// support OpenGL
	| PFD_DOUBLEBUFFER,		// double buffered
	PFD_TYPE_RGBA,			// RGBA type
	32,				// 32-bit color depth
	0, 0, 0, 0, 0, 0,		// color bits ignored
	0,				// no alpha buffer
	0,				// shift bit ignored
	0,				// no accumulation buffer
	0, 0, 0, 0,			// accum bits ignored
	16,				// 16-bit z-buffer
	8,				// 8-bit stencil buffer
	0,				// no auxiliary buffer
	PFD_MAIN_PLANE,			// main layer
	0,				// reserved
	0, 0, 0				// layer masks ignored
};
mh
Posts: 2292
Joined: Sat Jan 12, 2008 1:38 am

Post by mh »

You're never going to get 24-bit colour no matter what you specify (the closest possible is 32-bit with 8 of those bits unused) so it doesn't really matter. A 24-bit Z-buffer is OK to use as well; the most important thing there is to make sure that you clear depth and stencil at the same time.
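For example, one combined clear rather than separate calls:

Code: Select all

// clear depth and stencil in the same call so the driver can handle a packed
// depth/stencil buffer in one pass
glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);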
We had the power, we had the space, we had a sense of time and place
We knew the words, we knew the score, we knew what we were fighting for
r00k
Posts: 1111
Joined: Sat Nov 13, 2004 10:39 pm

Post by r00k »

OK, I thought the GPU liked 32-bit uploads instead of 24 because, if it was 24, it had to convert to 32 for native processing or something...