I've been working on the Q2DOS port for quite a few years with Sezero and one of the TODOs that have stared me in the face the entire time is the inline assembly for the particle drawing/blending code that was MSVC specific in r_part.c. Specifically R_DrawParticle() function. I did some reading on ASM and first moved the 3 blending declspec "naked" functions inside R_DrawParticle, made sure that worked. Then I broke this entire code out to an actual ASM file for use with ML.EXE. Still good. From here I started the conversion to GAS. It seems pretty straightforward, basically from what I understand all src, dst is switched on mov, cmp, etc. Typically things are declared by size for these functions like movl, movb, and so on. No big deal there. A gotcha (but the compiler is good enough to warn on this) is floating point math such as faddp, fsubp, etc. have the registers reversed.
Anyways, I converted it over. The engine didn't blow up (woo-hoo!) but no particles. I looked closer and noticed if I went to q2dm1 looked up at the sky and turned on cl_testparticles 1 I can see part of the test particles. If I fire the gun at the sky and look down quickly I can see part of it. So it looks like the z-clipping plane is wrong and the screen centering. I'm guessing an alignment issue? I have ruled out that it could be the 24-bit floating point mode that is set in some inline ASM because commenting that out in MSVC makes no difference, probably a small speed difference but I can port that part later. I have no idea at this point. But relevant code:
MASM version:
Code: Select all
.386P
.model FLAT
;
; r_parta.s
; x86 assembly-language particle code
;
include qasm.inc
include d_if.inc
if id386
_DATA SEGMENT
_DATA ENDS
CONST SEGMENT
eight_thousand_hex dd 047000000r
PARTICLE_33 equ 0
PARTICLE_66 equ 1
PARTICLE_OPAQUE equ 2
CONST ENDS
_BSS SEGMENT
short_izi DW 01H DUP (?)
align 4
zi DD 01h DUP (?)
u DD 01H DUP (?)
v DD 01H DUP (?)
tmp DD 01H DUP (?)
transformed_vec DD 03H DUP (?)
local_vec DD 03H DUP (?)
ebpsave DD 01H DUP (?)
_BSS ENDS
_TEXT SEGMENT
align 4
public _R_DrawParticle
_R_DrawParticle:
;
; save trashed variables
;
mov dword ptr [ebpsave], ebp
push esi
push edi
;
; transform the particle
;
; VectorSubtract (pparticle->origin, r_origin, local);
mov esi, dword ptr [_partparms+partparms_particle]
fld dword ptr [esi+0] ; p_o.x
fsub dword ptr [_r_origin+0] ; p_o.x-r_o.x
fld dword ptr [esi+4] ; p_o.y | p_o.x-r_o.x
fsub dword ptr [_r_origin+4] ; p_o.y-r_o.y | p_o.x-r_o.x
fld dword ptr [esi+8] ; p_o.z | p_o.y-r_o.y | p_o.x-r_o.x
fsub dword ptr [_r_origin+8] ; p_o.z-r_o.z | p_o.y-r_o.y | p_o.x-r_o.x
fxch st(2) ; p_o.x-r_o.x | p_o.y-r_o.y | p_o.z-r_o.z
fstp dword ptr [local_vec+0] ; p_o.y-r_o.y | p_o.z-r_o.z
fstp dword ptr [local_vec+4] ; p_o.z-r_o.z
fstp dword ptr [local_vec+8] ; (empty)
; transformed[0] = DotProduct(local, r_pright);
; transformed[1] = DotProduct(local, r_pup);
; transformed[2] = DotProduct(local, r_ppn);
fld dword ptr [local_vec+0] ; l.x
fmul dword ptr [_r_pright+0] ; l.x*pr.x
fld dword ptr [local_vec+4] ; l.y | l.x*pr.x
fmul dword ptr [_r_pright+4] ; l.y*pr.y | l.x*pr.x
fld dword ptr [local_vec+8] ; l.z | l.y*pr.y | l.x*pr.x
fmul dword ptr [_r_pright+8] ; l.z*pr.z | l.y*pr.y | l.x*pr.x
fxch st(2) ; l.x*pr.x | l.y*pr.y | l.z*pr.z
faddp st(1), st ; l.x*pr.x + l.y*pr.y | l.z*pr.z
faddp st(1), st ; l.x*pr.x + l.y*pr.y + l.z*pr.z
fstp dword ptr [transformed_vec+0] ; (empty)
fld dword ptr [local_vec+0] ; l.x
fmul dword ptr [_r_pup+0] ; l.x*pr.x
fld dword ptr [local_vec+4] ; l.y | l.x*pr.x
fmul dword ptr [_r_pup+4] ; l.y*pr.y | l.x*pr.x
fld dword ptr [local_vec+8] ; l.z | l.y*pr.y | l.x*pr.x
fmul dword ptr [_r_pup+8] ; l.z*pr.z | l.y*pr.y | l.x*pr.x
fxch st(2) ; l.x*pr.x | l.y*pr.y | l.z*pr.z
faddp st(1), st ; l.x*pr.x + l.y*pr.y | l.z*pr.z
faddp st(1), st ; l.x*pr.x + l.y*pr.y + l.z*pr.z
fstp dword ptr [transformed_vec+4] ; (empty)
fld dword ptr [local_vec+0] ; l.x
fmul dword ptr [_r_ppn+0] ; l.x*pr.x
fld dword ptr [local_vec+4] ; l.y | l.x*pr.x
fmul dword ptr [_r_ppn+4] ; l.y*pr.y | l.x*pr.x
fld dword ptr [local_vec+8] ; l.z | l.y*pr.y | l.x*pr.x
fmul dword ptr [_r_ppn+8] ; l.z*pr.z | l.y*pr.y | l.x*pr.x
fxch st(2) ; l.x*pr.x | l.y*pr.y | l.z*pr.z
faddp st(1), st(0) ; l.x*pr.x + l.y*pr.y | l.z*pr.z
faddp st(1), st(0) ; l.x*pr.x + l.y*pr.y + l.z*pr.z
fstp dword ptr [transformed_vec+8] ; (empty)
;
; make sure that the transformed particle is not in front of
; the particle Z clip plane. We can do the comparison in
; integer space since we know the sign of one of the inputs
; and can figure out the sign of the other easily enough.
;
; if (transformed[2] < PARTICLE_Z_CLIP)
; return;
mov eax, dword ptr [transformed_vec+8]
and eax, eax
js endpartfunc
cmp eax, float_particle_z_clip
jl endpartfunc
;
; project the point by initiating the 1/z calc
;
; zi = 1.0 / transformed[2];
fld float_1
fdiv dword ptr [transformed_vec+8]
; prefetch the next particle
mov ebp, ds:dword ptr [_s_prefetch_address]
mov ebp, [ebp]
; finish the above divide
fstp dword ptr [zi]
; u = (int)(xcenter + zi * transformed[0] + 0.5);
; v = (int)(ycenter - zi * transformed[1] + 0.5);
fld dword ptr [zi] ; zi
fmul dword ptr [transformed_vec+0] ; zi * transformed[0]
fld dword ptr [zi] ; zi | zi * transformed[0]
fmul dword ptr [transformed_vec+4] ; zi * transformed[1] | zi * transformed[0]
fxch st(1) ; zi * transformed[0] | zi * transformed[1]
fadd ds:dword ptr[_xcenter] ; xcenter + zi * transformed[0] | zi * transformed[1]
fxch st(1) ; zi * transformed[1] | xcenter + zi * transformed[0]
fld ds:dword ptr[_ycenter] ; ycenter | zi * transformed[1] | xcenter + zi * transformed[0]
fsubrp st(1), st(0) ; ycenter - zi * transformed[1] | xcenter + zi * transformed[0]
fxch st(1) ; xcenter + zi * transformed[0] | ycenter + zi * transformed[1]
fadd float_point5 ; xcenter + zi * transformed[0] + 0.5 | ycenter - zi * transformed[1]
fxch st(1) ; ycenter - zi * transformed[1] | xcenter + zi * transformed[0] + 0.5
fadd float_point5 ; ycenter - zi * transformed[1] + 0.5 | xcenter + zi * transformed[0] + 0.5
fxch st(1) ; u | v
fistp dword ptr [u] ; v
fistp dword ptr [v] ; (empty)
;
; clip out the particle
;
; if ((v > d_vrectbottom_particle) ||
; (u > d_vrectright_particle) ||
; (v < d_vrecty) ||
; (u < d_vrectx))
; {
; return;
; }
mov ebx, u
mov ecx, v
cmp ecx, ds:dword ptr [_d_vrectbottom_particle]
jg endpartfunc
cmp ecx, ds:dword ptr [_d_vrecty]
jl endpartfunc
cmp ebx, ds:dword ptr [_d_vrectright_particle]
jg endpartfunc
cmp ebx, ds:dword ptr [_d_vrectx]
jl endpartfunc
;
; compute addresses of zbuffer, framebuffer, and
; compute the Z-buffer reference value.
;
; EBX = U
; ECX = V
;
; Outputs:
; ESI = Z-buffer address
; EDI = framebuffer address
;
; ESI = d_pzbuffer + (d_zwidth * v) + u;
mov esi, ds:dword ptr[_d_pzbuffer] ; esi = d_pzbuffer
mov eax, ds:dword ptr[_d_zwidth] ; eax = d_zwidth
mul ecx ; eax = d_zwidth*v
add eax, ebx ; eax = d_zwidth*v+u
shl eax, 1 ; eax = 2*(d_zwidth*v+u)
add esi, eax ; esi = ( short * ) ( d_pzbuffer + ( d_zwidth * v ) + u )
; initiate
; izi = (int)(zi * 0x8000);
fld dword ptr [zi]
fmul eight_thousand_hex
; EDI = pdest = d_viewbuffer + d_scantable[v] + u;
lea edi, ds:dword ptr _d_scantable[ecx*4]
mov edi, dword ptr [edi]
add edi, ds:dword ptr[_d_viewbuffer]
add edi, ebx
; complete
; izi = (int)(zi * 0x8000);
fistp dword ptr [tmp]
mov eax, dword ptr [tmp]
mov word ptr [short_izi], ax
;
; determine the screen area covered by the particle,
; which also means clamping to a min and max
;
; pix = izi >> d_pix_shift;
xor edx, edx
mov dx, word ptr [short_izi]
mov ecx, ds:dword ptr[_d_pix_shift]
shr dx, cl
; if (pix < d_pix_min)
; pix = d_pix_min;
cmp edx, ds:dword ptr[_d_pix_min]
jge check_pix_max
mov edx, ds:dword ptr[_d_pix_min]
jmp skip_pix_clamp
; else if (pix > d_pix_max)
; pix = d_pix_max;
check_pix_max:
cmp edx, ds:dword ptr[_d_pix_max]
jle skip_pix_clamp
mov edx, ds:dword ptr[_d_pix_max]
skip_pix_clamp:
;
; render the appropriate pixels
;
; ECX = count (used for inner loop)
; EDX = count (used for outer loop)
; ESI = zbuffer
; EDI = framebuffer
;
mov ecx, edx
cmp ecx, 1
ja over
over:
;
; at this point:
;
; ECX = count
;
push ecx
push edi
push esi
top_of_pix_vert_loop:
top_of_pix_horiz_loop:
; for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)
; {
; for (i=0 ; i<pix ; i++)
; {
; if (pz[i] <= izi)
; {
; pdest[i] = blendparticle( color, pdest[i] );
; }
; }
; }
xor eax, eax
mov ax, word ptr [esi]
cmp ax, word ptr [short_izi]
jg end_of_horiz_loop
mov eax, ds:dword ptr [_partparms+partparms_color]
cmp ds:dword ptr [_partparms+partparms_level], PARTICLE_66
je blendfunc_66
jl blendfunc_33
; BlendParticle100
mov byte ptr [edi], al
jmp done_blending
blendfunc_33:
mov ebp, ds:dword ptr [_vid+vid_alphamap]
xor ebx, ebx
mov bl, byte ptr [edi]
shl ebx, 8
add ebp, ebx
add ebp, eax
mov al, byte ptr [ebp]
mov byte ptr [edi], al
jmp done_blending
blendfunc_66:
mov ebp, ds:dword ptr [_vid+vid_alphamap]
xor ebx, ebx
shl eax, 8
mov bl, byte ptr [edi]
add ebp, ebx
add ebp, eax
mov al, byte ptr [ebp]
mov byte ptr [edi], al
done_blending:
add edi, 1
add esi, 2
end_of_horiz_loop:
dec ecx
jnz top_of_pix_horiz_loop
pop esi
pop edi
mov ebp, ds:dword ptr[_d_zwidth]
shl ebp, 1
add esi, ebp
add edi, ds:dword ptr [_r_screenwidth]
pop ecx
push ecx
push edi
push esi
dec edx
jnz top_of_pix_vert_loop
pop ecx
pop ecx
pop ecx
endpartfunc:
pop edi
pop esi
mov ebp, dword ptr[ebpsave]
ret
_TEXT ENDS
endif ;id386
END
Code: Select all
//
// r_parta.s
// x86 assembly-language particle code.
//
#include "qasm.h"
#if id386
.data
eight_thousand_hex: .long 32768
.bss
.lcomm short_izi, 1
.align 4
.lcomm zi, 1, 4
.lcomm u, 1, 4
.lcomm v, 1, 4
.lcomm tmp, 1, 4
.lcomm transformed_vec, 12, 4
.lcomm local_vec, 12, 4
.lcomm ebpsave, 1, 4
.text
#define PARTICLE_33 0
#define PARTICLE_66 1
#define PARTICLE_OPAQUE 2
.align 4
.globl C(R_DrawParticle)
C(R_DrawParticle):
//
// save trashed variables
//
movl %ebp, ebpsave
pushl %esi
pushl %edi
//
// transform the particle
//
// VectorSubtract (pparticle->origin, r_origin, local);
movl C(partparms)+partparms_particle, %esi
flds 0(%esi) // p_o.x
fsubs C(r_origin) // p_o.x-r_o.x
flds 4(%esi) // p_o.y | p_o.x-r_o.x
fsubs C(r_origin)+4 // p_o.y-r_o.y | p_o.x-r_o.x
flds 8(%esi) // p_o.z | p_o.y-r_o.y | p_o.x-r_o.x
fsubs C(r_origin)+8 // p_o.z-r_o.z | p_o.y-r_o.y | p_o.x-r_o.x
fxch %st(2) // p_o.x-r_o.x | p_o.y-r_o.y | p_o.z-r_o.z
fstps local_vec+0 // p_o.y-r_o.y | p_o.z-r_o.z
fstps local_vec+4 // p_o.z-r_o.z
fstps local_vec+8 // (empty)
// transformed[0] = DotProduct(local, r_pright);
// transformed[1] = DotProduct(local, r_pup);
// transformed[2] = DotProduct(local, r_ppn);
flds local_vec+0 // l.x
fmuls C(r_pright)+0 // l.x*pr.x
flds local_vec+4 // l.y | l.x*pr.x
fmuls C(r_pright)+4 // l.y*pr.y | l.x*pr.x
flds local_vec+8 // l.z | l.y*pr.y | l.x*pr.x
fmuls C(r_pright)+8 // l.z*pr.z | l.y*pr.y | l.x*pr.x
fxch %st(2) // l.x*pr.x | l.y*pr.y | l.z*pr.z
faddp %st(0), %st(1) // l.x*pr.x + l.y*pr.y | l.z*pr.z
faddp %st(0), %st(1) // l.x*pr.x + l.y*pr.y + l.z*pr.z
fstps transformed_vec+0 // (empty)
flds local_vec+0 // l.x
fmuls C(r_pup)+0 // l.x*pr.x
flds local_vec+4 // l.y | l.x*pr.x
fmuls C(r_pup)+4 // l.y*pr.y | l.x*pr.x
flds local_vec+8 // l.z | l.y*pr.y | l.x*pr.x
fmuls C(r_pup)+8 // l.z*pr.z | l.y*pr.y | l.x*pr.x
fxch %st(2) // l.x*pr.x | l.y*pr.y | l.z*pr.z
faddp %st(0), %st(1) // l.x*pr.x + l.y*pr.y | l.z*pr.z
faddp %st(0), %st(1) // l.x*pr.x + l.y*pr.y + l.z*pr.z
fstps transformed_vec+4 // (empty)
flds local_vec+0 // l.x
fmuls C(r_ppn)+0 // l.x*pr.x
flds local_vec+4 // l.y | l.x*pr.x
fmuls C(r_ppn)+4 // l.y*pr.y | l.x*pr.x
flds local_vec+8 // l.z | l.y*pr.y | l.x*pr.x
fmuls C(r_ppn)+8 // l.z*pr.z | l.y*pr.y | l.x*pr.x
fxch %st(2) // l.x*pr.x | l.y*pr.y | l.z*pr.z
faddp %st(0), %st(1) // l.x*pr.x + l.y*pr.y | l.z*pr.z
faddp %st(0), %st(1) // l.x*pr.x + l.y*pr.y + l.z*pr.z
fstps transformed_vec+8 // (empty)
//
// make sure that the transformed particle is not in front of
// the particle Z clip plane. We can do the comparison in
// integer space since we know the sign of one of the inputs
// and can figure out the sign of the other easily enough.
//
// if (transformed[2] < PARTICLE_Z_CLIP)
// return;
movl transformed_vec+8, %eax
andl %eax, %eax
js endpartfunc
cmpl float_particle_z_clip, %eax
jl endpartfunc
//
// project the point by initiating the 1/z calc
//
// zi = 1.0 / transformed[2];
flds float_1
fdiv transformed_vec+8
// prefetch the next particle
movl C(s_prefetch_address), %ebp
movl (%ebp), %ebp
// finish the above divide
fstps zi
// u = (int)(xcenter + zi * transformed[0] + 0.5)
// v = (int)(ycenter - zi * transformed[1] + 0.5)
flds zi // zi
fmuls transformed_vec+0 // zi * transformed[0]
flds zi // zi | zi * transformed[0]
fmuls transformed_vec+4 // zi * transformed[1] | zi * transformed[0]
fxch %st(1) // zi * transformed[0] | zi * transformed[1]
fadds C(xcenter) // xcenter + zi * transformed[0] | zi * transformed[1]
fxch %st(1) // zi * transformed[1] | xcenter + zi * transformed[0]
flds C(ycenter) // ycenter | zi * transformed[1] | xcenter + zi * transformed[0]
fsubrp %st(0), %st(1) // ycenter - zi * transformed[1] | xcenter + zi * transformed[0]
fxch %st(1) // xcenter + zi * transformed[0] | ycenter + zi * transformed[1]
fadds float_point5 // xcenter + zi * transformed[0] + 0.5 | ycenter - zi * transformed[1]
fxch %st(1) // ycenter - zi * transformed[1] | xcenter + zi * transformed[0] + 0.5
fadds float_point5 // ycenter - zi * transformed[1] + 0.5 | xcenter + zi * transformed[0] + 0.5
fxch %st(1) // u | v
fistps u // v
fistps v // (empty)
//
// clip out the particle
//
// if ((v > d_vrectbottom_particle) ||
// (u > d_vrectright_particle) ||
// (v < d_vrecty) ||
// (u < d_vrectx))
// {
// return;
// }
movl u, %ebx
movl v, %ecx
cmpl C(d_vrectbottom_particle), %ecx
jg endpartfunc
cmpl C(d_vrecty), %ecx
jl endpartfunc
cmpl C(d_vrectright_particle), %ebx
jg endpartfunc
cmpl C(d_vrectx), %ebx
jl endpartfunc
//
// compute addresses of zbuffer, framebuffer, and
// compute the Z-buffer reference value.
//
// EBX = U
// ECX = V
//
// Outputs:
// ESI = Z-buffer address
// EDI = framebuffer address
//
// ESI = d_pzbuffer + (d_zwidth * v) + u;
movl C(d_pzbuffer), %esi // esi = d_pzbuffer
movl C(d_zwidth), %eax // eax = d_zwidth
mull %ecx // eax = d_zwidth*v
addl %ebx, %eax // eax = d_zwidth*v+u
shll $1, %eax // eax = 2*(d_zwidth*v+u)
addl %eax, %esi // esi = ( short * ) ( d_pzbuffer + ( d_zwidth * v ) + u )
// initiate
// izi = (int)(zi * 0x8000);
flds zi
fimuls eight_thousand_hex
// EDI = pdest = d_viewbuffer + d_scantable[v] + u;
leal C(d_scantable)(,%ecx,4),%edi
movl (%edi), %edi
addl C(d_viewbuffer), %edi
addl %ebx, %edi
// complete
// izi = (int)(zi * 0x8000);
fistps tmp
movl tmp, %eax
movw %ax, (short_izi)
//
// determine the screen area covered by the particle,
// which also means clamping to a min and max
//
// pix = izi >> d_pix_shift;
xorl %edx, %edx
movw (short_izi), %dx
movl C(d_pix_shift), %ecx
shrw %cl, %dx
// if (pix < d_pix_min)
// pix = d_pix_min;
cmpl C(d_pix_min), %edx
jge check_pix_max
movl C(d_pix_min), %edx
jmp skip_pix_clamp
// else if (pix > d_pix_max)
// pix = d_pix_max;
check_pix_max:
cmpl C(d_pix_max), %edx
jle skip_pix_clamp
movl C(d_pix_max), %edx
skip_pix_clamp:
//
// render the appropriate pixels
//
// ECX = count (used for inner loop)
// EDX = count (used for outer loop)
// ESI = zbuffer
// EDI = framebuffer
//
movl %edx, %ecx
cmpl $1, %ecx
ja over
over:
//
// at this point:
//
// ECX = count
//
pushl %ecx
pushl %edi
pushl %esi
top_of_pix_vert_loop:
top_of_pix_horiz_loop:
// for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)
// {
// for (i=0 ; i<pix ; i++)
// {
// if (pz[i] <= izi)
// {
// pdest[i] = blendparticle( color, pdest[i] );
// }
// }
// }
xorl %eax, %eax
movw (%esi), %ax
cmpw (short_izi), %ax
jg end_of_horiz_loop
movl C(partparms)+partparms_color, %eax
cmpl $PARTICLE_66, C(partparms)+partparms_level
je blendfunc_66
jl blendfunc_33
// BlendParticle100
movb %al, (%edi)
jmp done_blending
blendfunc_33:
movl C(vid)+vid_alphamap, %ebp
xorl %ebx, %ebx
movb (%edi), %bl
shll $8, %ebx
addl %ebx, %ebp
addl %eax, %ebp
movb (%ebp), %al
movb %al, (%edi)
jmp done_blending
blendfunc_66:
movl C(vid)+vid_alphamap, %ebp
xorl %ebx, %ebx
shll $8, %eax
movb (%edi), %bl
addl %ebx, %ebp
addl %eax, %ebp
movb (%ebp), %al
movb %al, (%edi)
done_blending:
addl $1, %edi
addl $2, %esi
end_of_horiz_loop:
decl %ecx
jnz top_of_pix_horiz_loop
popl %esi
popl %edi
movl C(d_zwidth), %ebp
shll $1, %ebp
addl %ebp, %esi
addl C(r_screenwidth), %edi
popl %ecx
pushl %ecx
pushl %edi
pushl %esi
decl %edx
jnz top_of_pix_vert_loop
popl %ecx
popl %ecx
popl %ecx
endpartfunc:
popl %edi
popl %esi
movl ebpsave, %ebp
ret
#endif // id386
Huge thanks to someone who can point me in the right direction and tell me what I did wrong so I can learn from it.