asm memcpy + memset for testing
Moderator: InsideQC Admins
5 posts
• Page 1 of 1
asm memcpy + memset for testing
the memcpy aint by me its from berserker quake2
the memset is a port i did from id's unix version for sdl
They seem to work pretty nicely, but I'm not sure if they're actually faster.
the memset is a port i did from id's unix version for sdl
- Code: Select all
/* Intended access pattern of a buffer handed to Q_Prefetch. */
typedef enum
{
    PRE_READ       = 0, /* buffer will only be read */
    PRE_WRITE      = 1, /* buffer will only be written */
    PRE_READ_WRITE = 2  /* buffer will be both read and written */
} e_prefetch;
// Pulls the leading part of a buffer into the cache by reading one byte
// from every 32-byte line.
//   s     - start of the buffer
//   bytes - size of the buffer; only the first 4096 bytes are ever touched
//   type  - PRE_READ and PRE_READ_WRITE trigger the touch loop;
//           PRE_WRITE (and any other value) does nothing
// 32-bit x86, MSVC inline assembler syntax.
void Q_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
    // Clamp and round up in C so the arithmetic is unsigned: a signed
    // compare would treat sizes >= 2 GiB as negative and skip the clamp.
    const unsigned int clamped = (bytes > 4096) ? 4096 : bytes;
    const unsigned int lines = (clamped + 0x1f) >> 5; // number of 32-byte lines

    // Write-only buffers are never prefetched here.
    if (type != PRE_READ && type != PRE_READ_WRITE)
        return;

    // Nothing to touch for an empty buffer.
    if (lines == 0)
        return;

    __asm
    {
        mov ebx,s
        mov ecx,lines               // ecx = lines left to touch, > 0 here
        jmp touchLine               // hop over the alignment padding
        align 16
    touchLine:
        test byte ptr [ebx],al      // read-only access; only flags change
        add ebx,32                  // advance to the next line
        dec ecx
        jnz touchLine
    }
}
// Copies count bytes from src to dest using 32-bit register moves.
// The bulk is moved in 32-byte blocks (highest block first), then the
// remaining 0-31 bytes are copied in 16/8/4/2/1-byte steps.
// No alignment-specific path exists: the same dword moves are used for
// every src/dest alignment. Overlapping buffers are not handled.
// 32-bit x86, MSVC inline assembler syntax.
void Q_memcpy (void *dest, const void *src, const size_t count) {
    Q_Prefetch (src, count, PRE_READ);
    __asm
    {
        push edi
        push esi
        mov ecx,count
        test ecx,ecx                // nothing to do for count == 0
        jz outta
        mov edx,dest                // edx = dst base
        mov ebx,src                 // ebx = src base
        // count is unsigned: use jb/jae so sizes >= 2 GiB are not
        // mistaken for negative numbers and silently skipped.
        cmp ecx,32                  // fewer than 32 bytes: tail only
        jb padding
        mov edi,ecx
        and edi,~31                 // edi = count&~31
        sub edi,32                  // edi = offset of the last full block
        align 16
    loopMisAligned:
        mov eax,[ebx + edi + 0 + 0*8]
        mov esi,[ebx + edi + 4 + 0*8]
        mov [edx+edi+0 + 0*8],eax
        mov [edx+edi+4 + 0*8],esi
        mov eax,[ebx + edi + 0 + 1*8]
        mov esi,[ebx + edi + 4 + 1*8]
        mov [edx+edi+0 + 1*8],eax
        mov [edx+edi+4 + 1*8],esi
        mov eax,[ebx + edi + 0 + 2*8]
        mov esi,[ebx + edi + 4 + 2*8]
        mov [edx+edi+0 + 2*8],eax
        mov [edx+edi+4 + 2*8],esi
        mov eax,[ebx + edi + 0 + 3*8]
        mov esi,[ebx + edi + 4 + 3*8]
        mov [edx+edi+0 + 3*8],eax
        mov [edx+edi+4 + 3*8],esi
        sub edi,32                  // step down one block
        jae loopMisAligned          // borrow only once offset 0 was copied
        mov edi,ecx
        and edi,~31                 // bytes handled by the block loop
        add ebx,edi                 // increase src pointer
        add edx,edi                 // increase dst pointer
        and ecx,31                  // new count (0-31)
        jz outta                    // if count = 0, get outta here
    padding:
        cmp ecx,16
        jb skip16
        mov eax,dword ptr [ebx]
        mov dword ptr [edx],eax
        mov eax,dword ptr [ebx+4]
        mov dword ptr [edx+4],eax
        mov eax,dword ptr [ebx+8]
        mov dword ptr [edx+8],eax
        mov eax,dword ptr [ebx+12]
        mov dword ptr [edx+12],eax
        sub ecx,16
        add ebx,16
        add edx,16
    skip16:                         // 0-15 remaining bytes
        cmp ecx,8
        jb skip8
        mov eax,dword ptr [ebx]
        mov dword ptr [edx],eax
        mov eax,dword ptr [ebx+4]
        sub ecx,8
        mov dword ptr [edx+4],eax
        add ebx,8
        add edx,8
    skip8:                          // 0-7 remaining bytes
        cmp ecx,4
        jb skip4
        mov eax,dword ptr [ebx]     // here 4-7 bytes
        add ebx,4
        sub ecx,4
        mov dword ptr [edx],eax
        add edx,4
    skip4:                          // 0-3 remaining bytes
        cmp ecx,2
        jb skip2
        mov ax,word ptr [ebx]       // two bytes
        cmp ecx,3                   // less than 3?
        mov word ptr [edx],ax       // mov leaves the flags from cmp intact
        jb outta
        mov al,byte ptr [ebx+2]     // last byte
        mov byte ptr [edx+2],al
        jmp outta
    skip2:                          // 0-1 remaining bytes
        cmp ecx,1
        jb outta
        mov al,byte ptr [ebx]
        mov byte ptr [edx],al
    outta:
        pop esi
        pop edi
    }
}
void *Q_memset(void* dest0, int val, size_t count0)
{
union {
byte bytes[8];
unsigned short words[4];
unsigned int dwords[2];
} dat;
byte *dest = (byte *)dest0;
int count = count0;
while( count > 0 && (((int)dest) & 7) ) {
*dest = val;
dest++;
count--;
}
if ( !count ) {
return dest0;
}
dat.bytes[0] = val;
dat.bytes[1] = val;
dat.words[1] = dat.words[0];
dat.dwords[1] = dat.dwords[0];
if ( count >= 64 ) {
__asm {
mov edi, dest
mov ecx, count
shr ecx, 6 // 64 bytes per iteration
movq mm1, dat // Read in source data
movq mm2, mm1
movq mm3, mm1
movq mm4, mm1
movq mm5, mm1
movq mm6, mm1
movq mm7, mm1
movq mm0, mm1
loop1:
movntq 0[edi], mm1 // Non-temporal stores
movntq 8[edi], mm2
movntq 16[edi], mm3
movntq 24[edi], mm4
movntq 32[edi], mm5
movntq 40[edi], mm6
movntq 48[edi], mm7
movntq 56[edi], mm0
add edi, 64
dec ecx
jnz loop1
}
dest += ( count & ~63 );
count &= 63;
}
if ( count >= 8 ) {
__asm {
mov edi, dest
mov ecx, count
shr ecx, 3 // 8 bytes per iteration
movq mm1, dat // Read in source data
loop2:
movntq 0[edi], mm1 // Non-temporal stores
add edi, 8
dec ecx
jnz loop2
}
dest += (count & ~7);
count &= 7;
}
while( count > 0 ) {
*dest = val;
dest++;
count--;
}
__asm emms
return dest0;
}
They seem to work pretty nicely, but I'm not sure if they're actually faster.
-

revelator - Posts: 2567
- Joined: Thu Jan 24, 2008 12:04 pm
- Location: inside tha debugger
The lines in Q_memset starting with movntq are generating the following error "inline assembler syntax error in 'opcode'; found 'constant'"
I suspect it's a compiler issue, as I was using VC 6.
I suspect it's a compiler issue, as I was using VC 6.
http://red.planetarena.org - Alien Arena and the CRX engine
- Irritant
- Posts: 250
- Joined: Mon May 19, 2008 2:54 pm
- Location: Maryland
reckless wrote:got the processor pack ? cause the memset function uses sse instructions maybe why the compiler bitches
can try by removing the 64 bit block since thats whats holding the sse instructions.
No I don't, but does that work on XP and Vista? I had heard there were problems with it.
Also, are there additional dependencies needed in Linux for this? Changes in makefile perhaps?
I'd like to give it a try, but I'm really scared of breaking stuff...
http://red.planetarena.org - Alien Arena and the CRX engine
- Irritant
- Posts: 250
- Joined: Mon May 19, 2008 2:54 pm
- Location: Maryland
Aye, it works just fine. On Vista you need a hacked SP5 (a prerequisite for the processor pack), because the one from Microsoft won't install there; on XP it works right out of the box. I keep it safe and sound on my FTP,
because sometimes I need compatibility libs and the .NET compilers aren't nice in that regard (runtime library hell on ice).
could also download a trial version of intels compiler (still compatible with msvc6)
in fact all ms compilers work fine on vista if you disable UAC even vc7 or the old vc4 hell i even have borland 6 running
there should be a link to my ftp in my thread about quake2xp the package is named vs6sp5_vista.EXE
With regard to Windows 7, things get a bit tougher (it seems MS is hell-bent on killing all their old software).
could also download a trial version of intels compiler (still compatible with msvc6)
in fact all ms compilers work fine on vista if you disable UAC even vc7 or the old vc4 hell i even have borland 6 running
there should be a link to my ftp in my thread about quake2xp the package is named vs6sp5_vista.EXE
With regard to Windows 7, things get a bit tougher (it seems MS is hell-bent on killing all their old software).
-

revelator - Posts: 2567
- Joined: Thu Jan 24, 2008 12:04 pm
- Location: inside tha debugger
5 posts
• Page 1 of 1
Who is online
Users browsing this forum: No registered users and 1 guest