;SSE raymarch engine by Abaddon
;code: TomCat

; PIT channel 0 divisor; the start-up code writes it to port 40H.
Divider EQU 67          ; INT8 speed
; Note-byte packing shared by the zax table and the IRQ handler:
; the handler splits a byte with AAM M, then adds A-1 back to the remainder.
A EQU 17H
M EQU (3DH-A+2)

; Picture size. SUBS is the width of the horizontal pixel groups handled by
; the blur code (it tests CL against SUBS-1).
RESX EQU 320
RESY EQU 200
SUBS EQU 4

; 16-byte float vectors addressed relative to SI; filled in by the start-up code.
EPS EQU SI    ;eps,1/RESY,1/RESY,1/RESY
LG2 EQU SI+16 ;.3,.3,.3,.3
L2T EQU SI+32 ;2*L2T*L2T in every lane (built on the x87 stack at start-up)
HALF EQU SI+48;.5,.5,.5,t
ONE EQU SI+64 ;1.,1.,1.,1.

; Byte offsets of the x, y and z lanes inside such a vector (lane 0 is offset 0).
x EQU 4
y EQU 8
z EQU 12

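;XMM0: scratch (step distance dS / reflection term)
;XMM1: not used in this listing
;XMM2: distance (g)
;XMM3: not used in this listing
;XMM4: point (q)
;XMM5: rayDirection
;XMM6: rayOrigin
;XMM7: not used in this listing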

; Program entry: set the video mode and palette, build the float constant
; vectors, reprogram the timer and hook INT 8, then fall into nextframe.
; NOTE(review): this code appears to rely on the customary DOS .COM entry
; state (AH=0, BX=0, SI=0100h, DI=SP=0FFFEh, a zero word on top of the
; stack). None of that is established in this file - confirm for the target.
ORG 256
 MOV AL,13H             ; with AH=0 this is BIOS "set video mode 13h"
.0:
 INT 10H                ; 1st pass: set mode; later passes: AX=1010h, set DAC register BX
 IMUL DX,BX,7FH         ; DH = red value derived from the index
 IMUL CX,BX,3FH         ; CH = green value
 IMUL AX,BX,38H
 MOV CL,AH              ; CL = blue value
 INC BX                 ; next index; leaves SF for the JNS below
 MOV AX,1010H           ; function for the next INT 10H (MOV does not touch flags)
 JNS .0                 ; loop until BX turns negative

 POP ES                 ; ES = word on top of stack (presumably 0 = interrupt vector table segment)
 PUSH RESY              ; keeps the word RESY in memory: read by FIDIV below and per pixel after the PUSHA frame
 PUSH 0A000H            ; video segment, popped into ES once the vector has been written
; Build the constants on the x87 stack (stack contents shown top first).
 FLD1                   ; 1
 FIDIV WORD [DI]        ; 1/RESY  (assumes [DI] is the RESY word just pushed - TODO confirm)
 FLD ST0                ; 1/RESY 1/RESY
 FADD ST1,ST0           ; 1/RESY eps
 FLDLG2                 ; LG2 1/RESY eps
 FLDL2T                 ; L2T LG2 1/RESY eps
 FMUL ST0,ST0           ; L2T*L2T LG2 1/RESY eps
 FLD ST0                ; L2T*L2T L2T*L2T LG2 1/RESY eps
 FADD ST1,ST0           ; L2T*L2T 2*L2T*L2T LG2 1/RESY eps
 FDIV ST0,ST1           ; .5 2*L2T*L2T LG2 1/RESY eps
 FLD1                   ; 1 .5 2*L2T*L2T LG2 1/RESY eps
; Store the top five values as five 4-lane vectors, walking SI downwards:
; ONE, HALF, L2T, LG2 and EPS end up at SI+64, +48, +32, +16 and +0.
 MOV CL,5               ; five vectors (CH is zero whenever LOOP executes)
.1:
 MOV CH,4               ; four lanes per vector
.2:
 ADD SI,SP              ; SP serves as the step (presumably 0FFFCh = -4 after the two pushes)
 FST DWORD [SI]
 DEC CH
 JNZ .2
 FSTP ST0               ; discard the value just replicated
 LOOP .1
 FST DWORD [SI]        ; eps = .01  (lane 0 of EPS; the value stays in ST0 and becomes t)

; Timer: put channel 2 in one-shot mode for the speaker and speed up channel 0.
 MOV AL,90H             ; control word: channel 2, low byte only, mode 0
 OUT 43H,AL
 SUB AL,90H-Divider    ; AL = Divider (bits 0-1 set: speaker gate and data on); no borrow, so CF=0
 OUT 61H,AL             ; enable the speaker
 OUT 40H,AL             ; channel 0 divisor, low byte = Divider
 SALC                   ; AL = 0 because CF=0
 OUT 40H,AL             ; channel 0 divisor, high byte = 0

; Point the INT 8 vector (offset 8*4 in segment ES) at IRQ in this code segment.
 MOV DI,8*4
 MOV AX,IRQ
 STOSW
 MOV AX,CS
 STOSW
 POP ES                 ; ES = 0A000H pushed above
 MOV BP,768             ; [BP+SI] is the 16-byte scratch vector used while marching

nextframe:
; Per-frame set-up: advance the time value kept on the x87 stack, then start
; the row and column loops that march one ray per pixel.

 FLDLG2
 FMUL ST0,ST0
 FMUL DWORD [HALF]      ; LG2*LG2*.5 (lane 0 of HALF)
 FADDP                  ; added to the value below it on the x87 stack (t), then popped
 FST DWORD [HALF+z]     ; t
 MOV DI,1024+SUBS-1     ; DI -> last byte of the first SUBS-pixel group in the buffer at DS:1024
 MOV AX,RESY/2          ; AX = row coordinate; this immediate is also read as data via [maxcol-2]
maxcol:
nextline:
 INC BX                 ; NOTE(review): BL is tested by the blur code; the intent of this per-row bump is not evident here
 MOV CX,-64             ; CX = column coordinate; the row ends when CX reaches RESX-64
nextpixel:
 MOV DX,26*RESY/2       ; DX = 26/eps
 PUSHA                  ; -18:DI SI BP SP BX DX CX AX RESY
 MOV BX,-8              ; BX -> saved DX; the four words from here are DX, CX, AX, RESY

;XMM2: g = 0
 XORPS XMM2,XMM2

;XMM5: D = maxd,x/ry,y/ry,1
 PMOVSXWD XMM5,[BX]     ; sign-extend those four words to dwords
 CVTDQ2PS XMM5,XMM5
 MULPS XMM5,[EPS]       ; * Aspect

;XMM6: O = .5,.5,.5,t
 MOVAPS XMM6,[HALF]

 MOV CX,256*150+3       ; CH = 150 march steps at most, CL = 3 reflections at most
march:
; One raymarch step. The scene is a grid of spheres of radius LG2 centred on
; rounded coordinates, with the centre's y replaced by a function of x, z and t.
; CH counts remaining steps, CL counts remaining reflections.

;XMM4: q = round(O)
 ROUNDPS XMM4,XMM6,0

;q.y = abs(sin(t+q.x*q.z))
; (approximated below as 1 - L2T*f*f, where f = v - round(v) and v = L2T*q.x*q.z + t)
 MOVAPS [BP+SI],XMM4    ; spill q so the x87 can read and patch its lanes
 FLDL2T
 FMUL DWORD [BP+SI+x]
 FMUL DWORD [BP+SI+z]
 FADD ST0,ST1           ; + t (kept below on the x87 stack)
 FLD ST0
 FRNDINT
 FSUBP                  ; f = v - round(v)
 FMUL ST0,ST0
 FCHS
 FLDL2T
 FMULP
 FADD DWORD [ONE]
 FSTP DWORD [BP+SI+y]   ; overwrite the y lane of the stored centre
;q = O - q
 MOVAPS XMM4,XMM6
 SUBPS XMM4,[BP+SI]

;dS = len(q)-r
 MOVAPS XMM0,XMM4
 DPPS XMM0,XMM0,11101111B ; dot over lanes 1-3, result broadcast to all four lanes
 SQRTPS XMM0,XMM0
 SUBPS XMM0,[LG2]

 COMISS XMM0,[EPS]      ; lane 0: dS against eps
 JNB nohit

;D -= dot(D,q)*q*L2T*L2T*2
 MOVAPS XMM0,XMM4
 DPPS XMM0,XMM5,11101110B ; result written to lanes 1-3 only; lane 0 becomes 0, so D's lane 0 (maxd) survives
 MULPS XMM0,XMM4
 MULPS XMM0,[L2T]
 SUBPS XMM5,XMM0

 DEC CL                 ; one reflection used
 JZ done

;dS = 1.
 MOVAPS XMM0,[ONE]

nohit:
;g += dS
 ADDPS XMM2,XMM0
 COMISS XMM2,XMM5       ; lane 0: travelled distance against maxd
 JNB done

;O += dS*D*LG2
 MULPS XMM0,XMM5
 MULPS XMM0,[LG2]
 ADDPS XMM6,XMM0

 DEC CH                 ; one march step used
 JNZ march

done:
; Turn the final direction into a shade and return it in DX through the PUSHA frame.
 MOVAPS [BP+SI],XMM5
 FLD DWORD [BP+SI+y]    ; D.y
 FIMUL WORD [maxcol-2]  ; * RESY/2, read from the immediate of the MOV AX,RESY/2 before maxcol
 FISTP WORD [BX]        ; -> saved DX slot (BX = -8)
 XOR CL,3               ; CL = number of reflections used (3 minus the remaining count)
 SAR WORD [BX],CL       ; halve the shade once per reflection
 MOV [BX+1],CL          ; high byte of saved DX = reflection count
 POPA                   ; DX = result; every other register as before the PUSHA

blur:
; Pixel output. On entry DX = march result (DL = shade, DH = reflection
; count), AX = row, CX = column, DI -> current byte in the buffer.
; Pixels are produced in groups of SUBS, highest address first.
 XCHG AX,DX             ; AX = result, DX = row (swapped back at .3)

 TEST CL,SUBS-1
 JNZ .0
 MOV BH,AH              ; first pixel of a group: remember its reflection count
.0:

; Fold the shade in AL into the byte written to the buffer.
 ADD AL,128
 JNS .1
 XOR AL,127
.1:
 TEST AH,AH
 JNZ .2
 XOR AL,127             ; no reflection on this ray: use a different range
 ADD AL,64
.2:
 MOV [DI],AL
 DEC DI
 TEST BL,BL
 LOOPNZ .3              ; CX--; go on to .3 when BL <> 0 (and CX <> 0)
 TEST AH,AH
 JNZ .3                 ; this ray reflected: go on to .3 as well
 TEST CL,SUBS-1
 JNZ .2                 ; BL = 0 and no reflection: repeat the same byte for the rest of the group

.3:
 XCHG AX,DX             ; AX = row again
 TEST CL,SUBS-1
 JNZ back

; Group finished: DI and CX ran downwards by SUBS, so move both to the end of the next group.
 ADD DI,2*SUBS
 ADD CX,2*SUBS
 MOV BL,BH              ; BL = reflection count of this group's first pixel

back:
 CMP CX,RESX-64
 JNE nextpixel

 DEC AX
 CMP AX,-RESY/2
 JG nextline
; Frame done: copy the buffer at DS:1024 to ES:0.
 PUSH SI
 MOV CH,64000/2/256     ; CL is 0 here (CX = RESX-64 = 256), so CX = 32000 words
 MOV SI,1024
 SUB DI,DI
;HLT
 REP MOVSW
 POP SI

 JMP nextframe           ; then go to next frame

; Timer interrupt handler: three software voices plus a note sequencer.
; All state lives in instruction immediates that the handler rewrites
; (c*+1 = counters/pointers, f*+1 = AAM divisors, v*+1 = values sent to port 42H).
; NOTE(review): several address tricks below depend on where this code is
; assembled; do not insert or remove bytes without re-checking them.
IRQ:
 PUSHA
 PUSH DS
 PUSH CS
 POP DS                 ; DS = CS so the immediates can be patched
 MOV DI,zax             ; base for the short [DI-zax+label] addresses below

; Voice 1: counter modulo the f1 divisor; write v1 to port 42H when it wraps.
c1:
 MOV AL,0               ; immediate = voice counter
 INC AX
f1:
 AAM 35                 ; AL = AL mod divisor, ZF set when it wraps to 0
 MOV [c1+1],AL
 JNZ c2
v1:
 MOV AL,1
 OUT 42H,AL
; Voice 2: same scheme.
c2:
 MOV AL,0
 INC AX
f2:
 AAM 79
 MOV [c2+1],AL
 JNZ c3
v2:
 MOV AL,1
 OUT 42H,AL
c3:
; MOV AL,0
; INC AX
;f3:
; AAM 35
; MOV [c3+1],AL
; JNZ c4
;v3:
; MOV AL,1
; OUT 42H,AL
; Voice 3 (labelled c4 because the c3 voice above is commented out).
c4:
 MOV AL,0
 INC AX
f4:
 AAM 79
 MOV [c4+1],AL
 JNZ c5
v4:
 MOV AL,1
 OUT 42H,AL

; Tick divider: the rest only runs when this counter reaches zero.
c5:
 MOV CX,256*7           ; immediate = remaining interrupts until the next tick
 LOOP nexttick

; Tick: lower each voice's v value by one, but never below 1.
 MOV BX,v1+1
x3:
 DEC BYTE [BX]
 JNZ @F
 INC BYTE [BX]
@@:
 ADD BL,c2-c1           ; next voice
 JPE x3                 ; NOTE(review): loop exit depends on the parity of BL, i.e. on the assembled addresses

; Note timer: fetch a new note only when this counter reaches zero.
c6:
 MOV AH,1               ; immediate = ticks left for the current note
 DEC AH
 JNZ nofetch

c7:
 MOV SI,zax             ; immediate = read pointer into the note table
 LODSB
 AAM M                  ; AH = byte / M, AL = byte mod M
 JZ replay              ; remainder 0 marks the end of the table

; Pick the voice for the new note; BL steps through 0, c4-c1, c2-c1.
c9:
 MOV BL,0               ; immediate = offset of the voice used last
 SUB BL,c2-c1
 JNC @F
 MOV BL,c4-c1
@@:
 MOV [DI-zax+c9+1],BL
 ADD AX,0100H+A-1       ; AL = remainder + A-1, AH = quotient + 1
; NOTE(review): the -511 bias presumably cancels BH (left over from v1+1)
; and adds the +1 needed to reach the immediates - verify against the listing.
 MOV [BX+c1-511],CH     ; CH is 0 here: restart that voice's counter
 MOV [BX+f1-511],AL     ; new AAM divisor for that voice
;SHR AL,1
 MOV [BX+v1-511],AL     ; new v value for that voice
 DB 3DH                 ; skip next instruction
replay:
 MOV SI,DI              ; end of table: rewind to zax
 ADD AH,AH              ; ticks for this note = 2*AH
 MOV [DI-zax+c7+1],SI   ; save the read pointer
nofetch:
 MOV [DI-zax+c6+1],AH   ; save the note timer
 MOV CH,7               ; CL is 0 here, so CX = 256*7 again
nexttick:
 MOV [DI-zax+c5+1],CX   ; save the tick divider

 POP DS
 MOV AL,20H
 OUT 20H,AL             ; end-of-interrupt to the interrupt controller
 POPA
IRET


; Note constants (D_4, G_4, ...) presumably come from this include - it is not visible here.
INCLUDE "NOTES2.INC"

; Note table read by the IRQ handler, one byte per note: note+1-A + M*k.
; The handler splits each byte with AAM M (remainder = note part, quotient = k).
; A byte whose remainder is 0 makes the handler restart from zax.
zax:
DB D_4+1-A+M*0,D_4+1-A+M*0,D_4+1-A+M*0
DB G_4+1-A+M*5,D_5+1-A+M*5
DB C_5+1-A+M*0,B_4+1-A+M*0,A_4+1-A+M*0,G_5+1-A+M*5,D_5+1-A+M*2
DB C_5+1-A+M*0,B_4+1-A+M*0,A_4+1-A+M*0,G_5+1-A+M*5,D_5+1-A+M*2
DB C_5+1-A+M*0,B_4+1-A+M*0,C_5+1-A+M*0,A_4+1-A+M*5
DB M*6                  ; end marker (remainder 0, quotient 6)
