
;****************************************************************************
;*
;* This is the SSE code from Crytek (CrySkinFull.cpp) which I will port to the 64-bit assembler.

;;;;;#ifdef _CPU_X86
;;;;;__declspec (align(32)) // align by cache line boundaries
;;;;;CryBBoxA16 CrySkinFull::g_BBox;
;;;;;
;;;;;void CrySkinFull::skinSSE (const Matrix44* pBones, Vec3dA16* pDest)
;;;;;{	
;;;;;	//PROFILE_FRAME_SELF(PureSkin);
;;;;;const Matrix44* pBone = pBones + m_numSkipBones, *pBoneEnd = pBones + m_numBones;
;;;;;	CrySkinAuxInt* pAux = &m_arrAux[0];
;;;;;	Vertex* pVertex = &m_arrVertices[0];
;;;;;	// set the bbox to the negative volume to make sure the bbox will calculate starting from the first vertex
;;;;;	g_BBox.vMin.v = Vec3d(1e6,1e6,1e6);// = pBone->GetTranslation();
;;;;;	g_BBox.vMax.v = Vec3d(-1e6,-1e6,-1e6);// = pBone->GetTranslation();
;;;;;
;;;;;#if FOR_TEST
;;;;;	for (int i = 0; i < CryAnimationBase::GetCVars()->ca_TestSkinningRepeats(); ++i)
;;;;;#endif


; Here's the C prototype for this function:
;
; static void Amd64Skinner(CrySkinAuxInt* pAux, Vertex* pVertex, Vec3dA16* pDest,
;                          const Matrix44* pBone, Vec3dA16* vMin,
;                          const Matrix44* pBoneEnd);
;
; Entry parameters:
;
; rcx      = pAux
; rdx      = pVertex
; r8       = pDest
; r9       = pBone
; [rsp+40] = pvMin			; After the call but before the push rbp
; [rsp+48] = pBoneEnd
;

; Displacements of the two stack-passed arguments, measured from the value RSP
; holds immediately after the function's initial `push rbp` (8 bytes below the
; entry RSP, so entry [rsp+40] becomes 48 and entry [rsp+48] becomes 56).
pvMin		equ	48
pBoneEnd	equ	56


; Size in bytes of one entry in the aux stream (2 = word, 4 = dword).
; This must match the define in CrySkinTypes.h.
CRY_SKIN_AUX_INT_SIZE	equ	2
;CRY_SKIN_AUX_INT_SIZE	equ	4

_text	SEGMENT

PUBLIC Amd64Skinner

;----------------------------------------------------------------------------
; Amd64Skinner
;
; ABI:    Microsoft x64, MASM (ml64) syntax.
;
; In:     rcx            = pAux      (stream of CRY_SKIN_AUX_INT_SIZE-byte integers)
;         rdx            = pVertex   (stream of 16-byte vertex records)
;         r8             = pDest     (array of 16-byte destination vectors)
;         r9             = pBone     (first 64-byte bone matrix to process)
;         entry [rsp+40] = pvMin     (bbox min; bbox max is read/written 16 bytes after it)
;         entry [rsp+48] = pBoneEnd  (one past the last bone matrix)
; Out:    nothing in registers; pDest[] and the bbox at pvMin are updated.
; Clobb:  rax, rcx, rdx, r10, r11, xmm0-xmm5, flags.
;         rbx, rbp, rsi, rdi, xmm6, xmm7 are saved and restored.
; Align:  pBone, pVertex, pDest and pvMin must be 16-byte aligned because they
;         are accessed with movaps / minps / maxps / addps memory operands.
;
; Register roles inside the bone loop:
;         rsi = next bone matrix          rbx = next vertex record
;         rdx = next aux-stream entry     rdi = pDest base
;         r10 = pvMin                     r11 = pBoneEnd
;         ecx = vertices left in the current group
;         xmm0..xmm3 = the four 16-byte rows of the current matrix
;
; Per bone the aux stream supplies, in order:
;         rigid count;
;         smooth-1 count, then one destination index per smooth-1 vertex;
;         smooth-2 count, then one destination index per smooth-2 vertex.
; Rigid vertices carry their destination index in the low 24 bits of the
; dword at offset 0Ch of the vertex record.
;----------------------------------------------------------------------------

SKIN_XMM_SAVE_BYTES	equ	28h		; 2 x 16 bytes for xmm6/xmm7 + 8 bytes so RSP ends up 16-aligned
SKIN_FRAME_BIAS		equ	40h		; SKIN_XMM_SAVE_BYTES + 3 pushes: rbp = RSP as it was right after `push rbp`
BBOX_VMAX_OFFSET	equ	16		; bbox max follows bbox min, one 16-byte vector later

Amd64Skinner	PROC	FRAME

			push	rbp
			.pushreg rbp
			push	rsi
			.pushreg rsi
			push	rdi
			.pushreg rdi
			push	rbx
			.pushreg rbx
			sub		rsp, SKIN_XMM_SAVE_BYTES
			.allocstack SKIN_XMM_SAVE_BYTES
			; Bias the frame pointer so that pvMin[rbp] / pBoneEnd[rbp]
			; (offsets defined relative to RSP after `push rbp`) resolve
			; to the caller's stack arguments.
			lea		rbp, [rsp + SKIN_FRAME_BIAS]
			.setframe rbp, SKIN_FRAME_BIAS
			movdqa	[rsp], xmm6
			.savexmm128 xmm6, 0
			movdqa	[rsp + 16], xmm7
			.savexmm128 xmm7, 16
			.endprolog

			; Move the arguments into the registers the 32-bit original used.
			; rbx must be loaded before rdx is overwritten.
			mov		rbx, rdx					; rbx = pVertex
			mov		rdx, rcx					; rdx = pAux
			mov		rdi, r8						; rdi = pDest
			mov		rsi, r9						; rsi = pBone

			; Both stack arguments are loop-invariant: fetch them once.
			mov		r10, qword ptr pvMin[rbp]		; r10 = pvMin
			mov		r11, qword ptr pBoneEnd[rbp]	; r11 = pBoneEnd

	startLoop:
			cmp		rsi, r11					; all bones processed?
			je		endLoop

			; load the current matrix and step to the next one
			movaps	xmm0, [rsi]
			movaps	xmm1, [rsi + 010h]
			movaps	xmm2, [rsi + 020h]
			movaps	xmm3, [rsi + 030h]
			add		rsi, 040h

			;/////////////////////////////////////////////////////////
			;// Rigid loop

			; ecx = number of rigid vertices for this bone
	if CRY_SKIN_AUX_INT_SIZE EQ 2
			movzx	ecx, word ptr [rdx]
			add		rdx, 2
	else
			mov		ecx, dword ptr [rdx]
			add		rdx, 4
	endif
			test	ecx, ecx
			jz		endLoopRigid

	startLoopRigid:
			movaps	xmm7, [rbx]					; xmm7 = offset.xyz + packed dword
			; 32-bit load: the index lives in the low 24 bits of the last
			; dword of this vertex, so nothing beyond the record is read.
			mov		eax, dword ptr [rbx + 0Ch]
			and		eax, 0FFFFFFh
			add		eax, eax					; index*2, scaled by 8 below = 16 bytes per vector
			add		rbx, 010h					; next vertex record

			; xmm7 = x*row0 + y*row1 + z*row2 + row3
			movss	xmm6, xmm7
			shufps	xmm6, xmm6, 0				; xmm6 = 4 copies of offset.x
			mulps	xmm6, xmm0
			movaps	xmm5, xmm7
			shufps	xmm5, xmm5, 055h			; xmm5 = 4 copies of offset.y
			mulps	xmm5, xmm1
			shufps	xmm7, xmm7, 0AAh			; xmm7 = 4 copies of offset.z
			mulps	xmm7, xmm2
			addps	xmm7, xmm5
			addps	xmm7, xmm6
			addps	xmm7, xmm3
			movaps	[rdi + rax*8], xmm7			; store the transformed vertex

			; grow the bounding box by the transformed vertex
			movaps	xmm5, xmm7
			movaps	xmm6, xmm7
			minps	xmm5, xmmword ptr [r10]
			maxps	xmm6, xmmword ptr [r10 + BBOX_VMAX_OFFSET]
			movaps	xmmword ptr [r10], xmm5
			movaps	xmmword ptr [r10 + BBOX_VMAX_OFFSET], xmm6

			dec		ecx
			jnz		startLoopRigid
	endLoopRigid:

			;/////////////////////////////////////////////////////////
			;// Smooth-1 loop: first contribution, overwrites the destination

			; ecx = number of smooth vertices met for the first time
	if CRY_SKIN_AUX_INT_SIZE EQ 2
			movzx	ecx, word ptr [rdx]
			add		rdx, 2
	else
			mov		ecx, dword ptr [rdx]
			add		rdx, 4
	endif
			test	ecx, ecx
			jz		endLoopSmooth1

	startLoopSmooth1:
			movaps	xmm7, [rbx]					; xmm7 = offset.xyz + blend weight
			; rax = destination index taken from the aux stream
	if CRY_SKIN_AUX_INT_SIZE EQ 2
			movzx	eax, word ptr [rdx]
			add		rdx, 2
	else
			mov		eax, dword ptr [rdx]
			add		rdx, 4
	endif
			add		rax, rax					; index*2, scaled by 8 below = 16 bytes per vector
			add		rbx, 010h					; next vertex record

			; xmm4 = x*row0 + y*row1 + z*row2 + row3
			movss	xmm6, xmm7
			shufps	xmm6, xmm6, 0				; xmm6 = 4 copies of offset.x
			mulps	xmm6, xmm0
			movaps	xmm5, xmm7
			shufps	xmm5, xmm5, 055h			; xmm5 = 4 copies of offset.y
			mulps	xmm5, xmm1
			movaps	xmm4, xmm7
			shufps	xmm4, xmm4, 0AAh			; xmm4 = 4 copies of offset.z
			mulps	xmm4, xmm2
			addps	xmm4, xmm5
			addps	xmm4, xmm6
			addps	xmm4, xmm3
			shufps	xmm7, xmm7, 0FFh			; xmm7 = 4 copies of the blend weight
			mulps	xmm7, xmm4					; xmm7 = weighted transformed vertex
			movaps	[rdi + rax*8], xmm7

			dec		ecx
			jnz		startLoopSmooth1
	endLoopSmooth1:

			;/////////////////////////////////////////////////////////
			;// Smooth-2 loop: later contributions, accumulated into the destination

			; ecx = number of smooth vertices met for the second time
	if CRY_SKIN_AUX_INT_SIZE EQ 2
			movzx	ecx, word ptr [rdx]
			add		rdx, 2
	else
			mov		ecx, dword ptr [rdx]
			add		rdx, 4
	endif
			test	ecx, ecx
			jz		endLoopSmooth2

	startLoopSmooth2:
			movaps	xmm7, [rbx]					; xmm7 = offset.xyz + blend weight
			; rax = address of the destination vector (pDest + index*16)
	if CRY_SKIN_AUX_INT_SIZE EQ 2
			movzx	eax, word ptr [rdx]
			add		rdx, 2
	else
			mov		eax, dword ptr [rdx]
			add		rdx, 4
	endif
			shl		rax, 4
			add		rax, rdi
			add		rbx, 010h					; next vertex record

			; xmm4 = x*row0 + y*row1 + z*row2 + row3
			movss	xmm6, xmm7
			shufps	xmm6, xmm6, 0				; xmm6 = 4 copies of offset.x
			mulps	xmm6, xmm0
			movaps	xmm5, xmm7
			shufps	xmm5, xmm5, 055h			; xmm5 = 4 copies of offset.y
			mulps	xmm5, xmm1
			movaps	xmm4, xmm7
			shufps	xmm4, xmm4, 0AAh			; xmm4 = 4 copies of offset.z
			mulps	xmm4, xmm2
			addps	xmm4, xmm5
			addps	xmm4, xmm6
			addps	xmm4, xmm3
			shufps	xmm7, xmm7, 0FFh			; xmm7 = 4 copies of the blend weight
			mulps	xmm7, xmm4					; xmm7 = weighted transformed vertex
			addps	xmm7, [rax]					; add to what is already stored there
			movaps	[rax], xmm7

			dec		ecx
			jnz		startLoopSmooth2
	endLoopSmooth2:

			jmp		startLoop
	endLoop:

			; RSP has not moved since the prologue: restore in reverse order.
			movdqa	xmm6, [rsp]
			movdqa	xmm7, [rsp + 16]
			add		rsp, SKIN_XMM_SAVE_BYTES
			pop		rbx
			pop		rdi
			pop		rsi
			pop		rbp
			ret

Amd64Skinner	ENDP

_text	ENDS

END