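/*
 *	ucontext.c
 *
 *	User context (System V compatible)
 *
 *	CLiP - Common Library for P/ECE
 *	Copyright (C) 2015-2016 Naoyuki Sawa
 *
 *	* Sun May 17 15:20:04 JST 2015 Naoyuki Sawa
 *	- 1st release.
 *	* Tue Nov 29 23:01:43 JST 2016 Naoyuki Sawa
 *	- In addition to the P/ECE implementation, added implementations for Win32 and Win64.
 *	  The P/ECE implementation itself has not been changed.
 *	- For Win32 there is a sample implementation on the page below, but this one was written from scratch.
 *	  "CodeProject - Unix ucontext_t Operations on Windows Platforms" (https://www.codeproject.com/tips/4225/unix-ucontext-t-operations-on-windows-platforms)
 *	  That implementation uses the GetThreadContext() and SetThreadContext() APIs, but both APIs are specified not to return valid values for the currently running thread itself.
 *	  (Even if it appears to work, it only happens to appear to work.)
 *	  Therefore this implementation does not use GetThreadContext()/SetThreadContext() and is written in assembly instead.
 *	  For the assembly implementation, the code generated for setjmp()/longjmp() on Win32 and on Win64 was examined, and the processing they perform was used as a reference.
 *	- This was the first time the author wrote Win64 assembly, so articles about the calling convention were used as references.
 *	  Those articles are quoted in the section "References used for the Win64 implementation" inside ucontext.c.
 *	- The Win64 versions of getcontext(), setcontext() and makecontext() are implemented in /clip/libclip.x64/ucontext_x64.asm.
 *	  (Note: the Win64 makecontext() is actually defined in this C file; swapcontext() was presumably meant.)
 *	  For details, see the comments at the "Win64" implementation inside ucontext.c.
 */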
#include "clip.h"
/****************************************************************************
 *	
 ****************************************************************************/
#ifdef  PIECE
static_assert((sizeof(stack_t) ==  8) && (sizeof(mcontext_t) ==  24) && (sizeof(ucontext_t) ==  36), "stack_t\,,mcontext_t\,,ucontext_t\̂̃TCYsłB");	//P/ECE
#endif//PIECE
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#if     (defined(_WIN32) && !defined(_WIN64))
static_assert((sizeof(stack_t) ==  8) && (sizeof(mcontext_t) ==  24) && (sizeof(ucontext_t) ==  36), "stack_t\,,mcontext_t\,,ucontext_t\̂̃TCYsłB");	//Win32
#endif//(defined(_WIN32) && !defined(_WIN64))
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#if     (defined(_WIN32) &&  defined(_WIN64))
static_assert((sizeof(stack_t) == 16) && (sizeof(mcontext_t) == 240) && (sizeof(ucontext_t) == 264), "stack_t\,,mcontext_t\,,ucontext_t\̂̃TCYsłB");	//Win64
#endif//(defined(_WIN32) &&  defined(_WIN64))
/****************************************************************************
 *	P/ECE
 ****************************************************************************/
#ifdef  PIECE
/*static*/ void ContextEntry();
/*static*/ void ContextLink();
asm("
		.code
		.align		1
ContextEntry:	;//-------------------------------------;//
		xld.w		%r12, [%sp+ 0]		;//%r12 := arg1,,sl
		xld.w		%r13, [%sp+ 4]		;//%r13 := arg2,,sl
		xld.w		%r14, [%sp+ 8]		;//%r14 := arg3,,sl
		xld.w		%r15, [%sp+12]		;//%r15 := arg4,,sl
		xadd		%sp, %sp,  16		;//
		ret					;//func(arg1,arg2,arg3,arg4)
ContextLink:	;//-------------------------------------;//
		xld.w		%r12, [%sp+0]		;//%r12 := ucp
	;//sv	xadd		%sp, %sp, 4		;//
		xld.w		%r12, [%r12+0]		;//%r12 := uc_link = ucp->uc_link
		cmp		%r12, 0			;//
		xjreq		exit			;//if(!uc_link) { exit(0) }
		xjp		setcontext		;//setcontext(uc_link)
");
/*--------------------------------------------------------------------------*/
int swapcontext(ucontext_t* oucp, const ucontext_t* ucp);
int setcontext(const ucontext_t* ucp);
int getcontext(ucontext_t* ucp);
asm("
		.code
		.align		1
		.global		swapcontext
		.global		setcontext
		.global		getcontext
swapcontext:	;//-------------------------------------;//
		add		%r12, 12		;//%r12 := mcp = &oucp->uc_mcontext
		ld.w		%r4, %sp		;//%r4  := %sp
		ld.w		%r5, [%r4]+		;//%r5  := %pc = [%r4]+
		ld.w		%sp, %r4		;//%sp  := %r4
		ld.w		[%r12]+, %r0		;//mcp->r0 := %r0
		ld.w		[%r12]+, %r1		;//mcp->r1 := %r1
		ld.w		[%r12]+, %r2		;//mcp->r2 := %r2
		ld.w		[%r12]+, %r3		;//mcp->r3 := %r3
		ld.w		[%r12]+, %r4		;//mcp->sp := %r4
		ld.w		[%r12]+, %r5		;//mcp->pc := %r5
		ld.w		%r12, %r13		;//%r12 := ucp				return setcontext(ucp)
setcontext:	;//- - - - - - - - - - - - - - - - - - -;//
		add		%r12, 12		;//%r12 := mcp = &ucp->uc_mcontext
		ld.w		%r0, [%r12]+		;//%r0  := mcp->r0
		ld.w		%r1, [%r12]+		;//%r1  := mcp->r1
		ld.w		%r2, [%r12]+		;//%r2  := mcp->r2
		ld.w		%r3, [%r12]+		;//%r3  := mcp->r3
		ld.w		%r4, [%r12]+		;//%r4  := mcp->sp
		ld.w		%r5, [%r12]+		;//%r5  := mcp->pc
		ld.w		%sp, %r4		;//%sp  := %r4
		ld.w		%r10, 0			;//%r10 := 0
		jp		%r5			;//return  0				'jp.d %rb'͕s
getcontext:	;//-------------------------------------;//
		add		%r12, 12		;//%r12 := mcp = &ucp->uc_mcontext
		ld.w		%r4, %sp		;//%r4  := %sp
		ld.w		%r5, [%r4]+		;//%r5  := %pc = [%r4]+
		ld.w		%sp, %r4		;//%sp  := %r4
		ld.w		[%r12]+, %r0		;//mcp->r0 := %r0
		ld.w		[%r12]+, %r1		;//mcp->r1 := %r1
		ld.w		[%r12]+, %r2		;//mcp->r2 := %r2
		ld.w		[%r12]+, %r3		;//mcp->r3 := %r3
		ld.w		[%r12]+, %r4		;//mcp->sp := %r4
		ld.w		[%r12]+, %r5		;//mcp->pc := %r5
		ld.w		%r10, 0			;//%r10 := 0
		jp		%r5			;//return  0				'jp.d %rb'͕s
");
/*--------------------------------------------------------------------------*/
void makecontext(ucontext_t* ucp, void (*func)(), int argc, ...) {
	void** stack;
	va_list ap;
	stack = (void**)((char*)ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size);
	*--stack = ucp;
	*--stack = ContextLink;
	*--stack = func;
	ucp->uc_mcontext.sp = (stack -= 4);
	ucp->uc_mcontext.pc = ContextEntry;
	if((unsigned)argc > 4) { DIE(); }		//(argc4)ɐB(argc4)ɑΉ鎖\RXg傫ɎۂɎgp鎖͋HB
	va_start(ap, argc);
	while(--argc >= 0) { *stack++ = va_arg(ap, void*); }
	va_end(ap);
}
#endif//PIECE
/****************************************************************************
 *	Win32
 ****************************************************************************/
#if     (defined(_WIN32) && !defined(_WIN64))
static void ContextEntry(ucontext_t* ucp, void (*func)(), void* arg1, void* arg2, void* arg3, void* arg4) {
	(*func)(arg1, arg2, arg3, arg4);
	if(ucp->uc_link) { setcontext(ucp->uc_link); }
	exit(0);
}
/*--------------------------------------------------------------------------*/
__declspec(naked) int getcontext(ucontext_t* ucp) {
	__asm {
			;//[esp+ 0] := retp
			;//[esp+ 4] := ucp
			pop		edx					;//edx := retp = *esp++
			mov		ecx, dword ptr [esp+ 0]			;//ecx := ucp
			mov		dword ptr [ecx+12], ebx			;//ucp->uc_mcontext.ebx := ebx
			mov		dword ptr [ecx+16], esi			;//ucp->uc_mcontext.esi := esi
			mov		dword ptr [ecx+20], edi			;//ucp->uc_mcontext.edi := edi
			mov		dword ptr [ecx+24], ebp			;//ucp->uc_mcontext.ebp := ebp
			mov		dword ptr [ecx+28], esp			;//ucp->uc_mcontext.esp := esp
			mov		dword ptr [ecx+32], edx			;//ucp->uc_mcontext.eip := retp
			mov		eax, 0					;//eax := 0
			jmp		edx					;//return 0
	}
}
/*--------------------------------------------------------------------------*/
__declspec(naked) int setcontext(const ucontext_t* ucp) {
	__asm {
			;//[esp+ 0] := retp
			;//[esp+ 4] := ucp
			mov		ecx, dword ptr [esp+ 4]			;//ecx := ucp
			mov		ebx, dword ptr [ecx+12]			;//ebx := ucp->uc_mcontext.ebx
			mov		esi, dword ptr [ecx+16]			;//esi := ucp->uc_mcontext.esi
			mov		edi, dword ptr [ecx+20]			;//edi := ucp->uc_mcontext.edi
			mov		ebp, dword ptr [ecx+24]			;//ebp := ucp->uc_mcontext.ebp
			mov		esp, dword ptr [ecx+28]			;//esp := ucp->uc_mcontext.esp
			mov		edx, dword ptr [ecx+32]			;//edx := ucp->uc_mcontext.eip
			mov		eax, 0					;//eax := 0
			jmp		edx					;//return 0
	}
}
/*--------------------------------------------------------------------------*/
__declspec(naked) int swapcontext(ucontext_t* oucp, const ucontext_t* ucp) {
	__asm {
			;//[esp+ 0] := retp
			;//[esp+ 4] := oucp
			;//[esp+ 8] := ucp
			pop		edx					;//edx := retp = *esp++
			mov		ecx, dword ptr [esp+ 0]			;//ecx := oucp
			mov		dword ptr [ecx+12], ebx			;//oucp->uc_mcontext.ebx := ebx
			mov		dword ptr [ecx+16], esi			;//oucp->uc_mcontext.esi := esi
			mov		dword ptr [ecx+20], edi			;//oucp->uc_mcontext.edi := edi
			mov		dword ptr [ecx+24], ebp			;//oucp->uc_mcontext.ebp := ebp
			mov		dword ptr [ecx+28], esp			;//oucp->uc_mcontext.esp := esp
			mov		dword ptr [ecx+32], edx			;//oucp->uc_mcontext.eip := retp
			;//- - - - - - - - - - - - - - - - - - - - - - - - - - -;//
			mov		ecx, dword ptr [esp+ 4]			;//ecx := ucp
			mov		ebx, dword ptr [ecx+12]			;//ebx := ucp->uc_mcontext.ebx
			mov		esi, dword ptr [ecx+16]			;//esi := ucp->uc_mcontext.esi
			mov		edi, dword ptr [ecx+20]			;//edi := ucp->uc_mcontext.edi
			mov		ebp, dword ptr [ecx+24]			;//ebp := ucp->uc_mcontext.ebp
			mov		esp, dword ptr [ecx+28]			;//esp := ucp->uc_mcontext.esp
			mov		edx, dword ptr [ecx+32]			;//edx := ucp->uc_mcontext.eip
			mov		eax, 0					;//eax := 0
			jmp		edx					;//return 0
	}
}
/*--------------------------------------------------------------------------*/
void makecontext(ucontext_t* ucp, void (*func)(), int argc, ...) {
	void** stack;
	va_list ap;
	stack = (void**)((char*)ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size);
	stack -= (1/*retp(_~[)*/ + 1/*ucp*/ + 1/*func*/ + 4/*arg1,arg2,arg3,arg4*/);
	ucp->uc_mcontext.esp = stack;
	ucp->uc_mcontext.eip = ContextEntry;
	if((unsigned)argc > 4) { DIE(); }		//(argc4)ɐB(argc4)ɑΉ鎖\RXg傫ɎۂɎgp鎖͋HB
	stack += 1/*retp(_~[)*/;
	*stack++ = ucp;
	*stack++ = func;
	va_start(ap, argc);
	while(--argc >= 0) { *stack++ = va_arg(ap, void*); }
	va_end(ap);
	//X^bN̓e͈ȉ̂悤ɂȂĂ܂B
	//ucp->uc_stack.ss_sp
	//E		
	//E		
	//E		
	//retp(_~[)ucp->uc_mcontext.esp+ 0
	//ucp		ucp->uc_mcontext.esp+ 4
	//func	ucp->uc_mcontext.esp+ 8
	//arg1	ucp->uc_mcontext.esp+12
	//arg2	ucp->uc_mcontext.esp+16
	//arg3	ucp->uc_mcontext.esp+20
	//arg4	ucp->uc_mcontext.esp+24
	//ucp->uc_stack.ss_sp+ucp->uc_stack.ss_size
}
#endif//(defined(_WIN32) && !defined(_WIN64))
/****************************************************************************
 *	Win64
 ****************************************************************************/
#if     (defined(_WIN32) &&  defined(_WIN64))
static void ContextEntry(void* rcx, void* rdx, void* r8, void* r9, ucontext_t* ucp, void (*func)(), void* arg1, void* arg2, void* arg3, void* arg4) {
	//               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~fastcallĂяoKł(Ƃψł)ŏ4̈͏ɃWX^nɂȂBmakecontext()ɂĈX^bNɐςނ߂ɍŏ4_~[ƂB
	(*func)(arg1, arg2, arg3, arg4);
	if(ucp->uc_link) { setcontext(ucp->uc_link); }
	exit(0);
}
/*--------------------------------------------------------------------------*/
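// * Tue Nov 29 23:01:43 JST 2016 Naoyuki Sawa
// - The Win64 versions of getcontext(), setcontext() and makecontext() are implemented in /clip/libclip.x64/ucontext_x64.asm.
//   (Note: the Win64 makecontext() is actually defined in C just below; swapcontext() was presumably meant.)
//   This is because, for the x64 target of VS2015, inline assembly cannot be used in C source files, so an assembler source file has to be created and assembled.
// - References
//   (1) "Yahoo Chiebukuro - How to compile assembly using Visual C++ on Windows" (http://detail.chiebukuro.yahoo.co.jp/qa/question_detail/q1296433897)
//   (2) "Office Daytime" (http://www.officedaytime.com/), "Porting Visual C++ inline assembly to x64" (http://www.officedaytime.com/tips/asm64/index.html)
//   (3) "Sample is best! ASM > hello" (http://www32.atwiki.jp/sampleisbest/pages/82.html)
//   (4) "YouTube - How to create a Visual Studio 2010, 2012, 2013 or 2015 64 bit Assembly language project." (https://www.youtube.com/watch?v=QWWTrdNkhBI)
// - Reference (1) above is the most detailed and was helpful.
//   However, the article (1) explains the build settings using VS2010, and that method could not be used to configure VS2015.
//   With VS2015, the method explained in (3) and (4) worked.
//   Reading (1), (3) and (4) gives the impression that the method of (3) and (4) can only create assembler-only projects, but a project mixing C and assembly could be made without problems.
// - This time, ucontext_x64.asm was added to the 64-bit CLiP library project as follows.
//   1. In VS2015, open the 64-bit CLiP library project (\Home\Share\Piece\clip\libclip.x64\libclip.sln).
//   2. In the Solution Explorer pane, click "libclip" to select it.
//   3. From the menu bar, choose "Project(P)" -> "Build Customizations(B)...".
//   4. The "Visual C++ Build Customization Files" dialog appears; check "masm(.targets,.props)" and press "OK" to close the dialog.
//   5. In the Solution Explorer pane, right-click "libclip", choose "Add(D)" -> "Existing Item(G)..." and add ucontext_x64.asm.
//   That is all.
//   If steps 2.-4. and step 5. are done in the reverse order, the build settings of ucontext_x64.asm do not work properly, so mind the order. (This is described in the "Adding source files" chapter of reference (3).)
//   However, the settings of steps 2.-4. should only need to be made once, so from now on simply adding the asm file should be enough.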
/*--------------------------------------------------------------------------*/
void makecontext(ucontext_t* ucp, void (*func)(), int argc, ...) {
	void** stack;
	va_list ap;
	stack = (void**)((char*)ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size);
	stack -= (4/*rcx,rdx,r8,r9ޔp*/ + 1/*ucp*/ + 1/*func*/ + 4/*arg1,arg2,arg3,arg4*/);
	stack = (void**)((intptr_t)stack & ~(16 - 1));	//16 byte aligned
	stack -= 1/*retp(_~[)*/;
	ucp->uc_mcontext.rsp = stack;
	ucp->uc_mcontext.rip = ContextEntry;
	stack += (1/*retp(_~[)*/ + 4/*rcx,rdx,r8,r9ޔp*/);
	*stack++ = ucp;
	*stack++ = func;
	if((unsigned)argc > 4) { DIE(); }		//(argc4)ɐB(argc4)ɑΉ鎖\RXg傫ɎۂɎgp鎖͋HB
	va_start(ap, argc);
	while(--argc >= 0) { *stack++ = va_arg(ap, void*); }
	va_end(ap);
	//X^bN̓e͈ȉ̂悤ɂȂĂ܂B
	//ucp->uc_stack.ss_sp
	//E		
	//E		
	//E		
	//retp(_~[)ucp->uc_mcontext.esp+ 0
	//rcxޔp	ucp->uc_mcontext.esp+ 8	16 byte aligned
	//rdxޔp	ucp->uc_mcontext.esp+16
	//r8ޔp	ucp->uc_mcontext.esp+24
	//r9ޔp	ucp->uc_mcontext.esp+32
	//ucp		ucp->uc_mcontext.esp+40
	//func	ucp->uc_mcontext.esp+48
	//arg1	ucp->uc_mcontext.esp+56
	//arg2	ucp->uc_mcontext.esp+64
	//arg3	ucp->uc_mcontext.esp+72
	//arg4	ucp->uc_mcontext.esp+80
	//ucp->uc_stack.ss_sp+ucp->uc_stack.ss_size
}
#endif//(defined(_WIN32) &&  defined(_WIN64))
/****************************************************************************
 *	getcontext(),setcontext()̎gp
 ****************************************************************************/
#if 0
//getcontext(),setcontext()̎gp
void app_main(){
 volatile int i=0;//'volatile'K{
 ucontext_t c;
 getcontext(&c);
 printf("%d\n",i);
 delay(SEC(1));
 if(++i<=9){setcontext(&c);}
}
#endif
/****************************************************************************
 *	makecontext(),swapcontext()̎gp
 ****************************************************************************/
#if 0
ucontext_t c1,c2,c3;
unsigned char stack[1024];
void func2(const char* fmt,int num){
 printf("func2 called\n");
 do{
  printf(fmt,num);
  swapcontext(&c2,&c1);
 }while(--num);
 //̌c2.uc_linkɏ]c3֑JڂB
}
void func3(){
 printf("func3 called\n");
 //̌c3.uc_link==NULLȂ̂ŏIB
}
void app_main(){
 //'uc_link'ݒ肷
 getcontext(&c2);//makecontext()̑Ogetcontext()ĂԂBP/ECEł͏ȗ\`ĂłɂB
 c2.uc_link=&c3;
 c2.uc_stack.ss_sp=stack;
 c2.uc_stack.ss_size=sizeof stack;
 makecontext(&c2,func2,2,"num=%d",5);
 //'uc_link'ݒ肵Ȃ
 getcontext(&c3);//makecontext()̑Ogetcontext()ĂԂBP/ECEł͏ȗ\`ĂłɂB
 c3.uc_link=NULL;
 c3.uc_stack.ss_sp=FRAM4_START;//stack_switch_FRAM4()̓Ƃ鎖\B
 c3.uc_stack.ss_size=FRAM4_SIZE;//
 makecontext(&c3,func3,0);
 //c1c2(5)c1c2(4)c1c2(3)c1c2(2)c1c2(1)c1c2(0)c3ˏI
 for(;;){
  swapcontext(&c1,&c2);
  putchar('\n');
 }
}
#endif
/****************************************************************************
 *	References used for the Win64 implementation
 ****************************************************************************/
#if 0
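Quoted from the reference "MSDN - x64 Software Conventions - Register Usage" (https://msdn.microsoft.com/ja-jp/library/9z1stfyw.aspx)
Register Usage

The x64 architecture provides 16 general-purpose registers (hereafter called integer registers) and 16 XMM/YMM registers for floating-point use.
Volatile registers are scratch registers that the caller assumes to be destroyed across a call.
Nonvolatile registers are required to keep their values across a function call.
Saving the nonvolatile registers that are used is the responsibility of the callee.

The following table shows how each register is used across function calls.

	Register		Status					Use
	----------------------	--------------------------------------	------------------------------------------------------------------------------------------
	RAX			Volatile				Return value register
	RCX			Volatile				First integer argument
	RDX			Volatile				Second integer argument
	R8			Volatile				Third integer argument
	R9			Volatile				Fourth integer argument
	R10:R11			Volatile				Must be preserved as needed by the caller. Used in syscall/sysret instructions.
	R12:R15			Nonvolatile				Must be preserved by the callee.
	RDI			Nonvolatile				Must be preserved by the callee.
	RSI			Nonvolatile				Must be preserved by the callee.
	RBX			Nonvolatile				Must be preserved by the callee.
	RBP			Nonvolatile				May be used as a frame pointer. Must be preserved by the callee.
	RSP			Nonvolatile				Stack pointer
	XMM0,YMM0		Volatile				First FP argument. First vector-type argument when __vectorcall is used.
	XMM1,YMM1		Volatile				Second FP argument. Second vector-type argument when __vectorcall is used.
	XMM2,YMM2		Volatile				Third FP argument. Third vector-type argument when __vectorcall is used.
	XMM3,YMM3		Volatile				Fourth FP argument. Fourth vector-type argument when __vectorcall is used.
	XMM4,YMM4		Volatile				Must be preserved as needed by the caller. Fifth vector-type argument when __vectorcall is used.
	XMM5,YMM5		Volatile				Must be preserved as needed by the caller. Sixth vector-type argument when __vectorcall is used.
	XMM6:XMM15,YMM6:YMM15	Nonvolatile (XMM), Volatile (upper half of YMM)	Must be preserved as needed by the callee. YMM registers must be preserved as needed by the caller.
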
#endif
#if 0
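Quoted from the reference "Akihiro Notes" (http://akihironotes.blogspot.jp/), article "Microsoft x64 calling convention" (http://akihironotes.blogspot.jp/2009/04/microsoft-x64.html)
Microsoft x64 calling convention

On x86 there are several calling conventions such as cdecl, stdcall and fastcall, but on x64 there is only fastcall.

Arguments

- 1st argument: rcx or xmm0
- 2nd argument: rdx or xmm1
- 3rd argument: r8 or xmm2
- 4th argument: r9 or xmm3
- 5th and later arguments: stack

For the 1st to 4th arguments, the value is passed as follows depending on the type.

- Integer types, structures of 1, 2, 4 or 8 bytes, and __m64 are passed by value in a general-purpose register. Unused upper bits are not cleared.
- float/double are passed by value in an xmm register. Unused upper bits are not cleared.
- Anything else (structures of sizes other than 1, 2, 4 or 8 bytes, __m128) is passed as a pointer in a general-purpose register. The pointer is aligned to 16 bytes.

Return values

- Integer types, structures of 1, 2, 4 or 8 bytes, and __m64 are returned by value in rax.
- float/double/__m128 are returned by value in xmm0.
- For anything else, a pointer to the return value is passed as the first argument of the call (rcx), and the contents of the return value are copied to that address. rax returns that pointer (that is, the same value as the 1st argument). Because the 1st argument becomes the pointer to the return value, the real arguments of the function are passed from the 2nd argument onward.

Stack at the time of a call

  When calling a function, the caller must reserve at least 32 bytes of stack space before executing the call.
  32 bytes is the size needed to spill the four register-passed arguments to the stack. It must be reserved even when the called function actually takes three or fewer arguments. The contents of the reserved stack space may be left uninitialized. The 5th and later arguments are pushed so that they follow this area.
  In addition, separately from this rule, the stack pointer immediately before the call must be aligned to 16 bytes.

	----------------------------------------
	(stack, lower addresses)
	----------------------------------------
	Return address
	---------------------------------------- 16 byte aligned
	Spill slot for the 1st argument (rcx)
	----------------------------------------
	Spill slot for the 2nd argument (rdx)
	----------------------------------------
	Spill slot for the 3rd argument (r8)
	----------------------------------------
	Spill slot for the 4th argument (r9)
	----------------------------------------
	(stack, higher addresses)
	----------------------------------------
	State of the stack when calling a function with four or fewer arguments

	----------------------------------------
	(stack, lower addresses)
	----------------------------------------
	Return address
	---------------------------------------- 16 byte aligned
	Spill slot for the 1st argument (rcx)
	----------------------------------------
	Spill slot for the 2nd argument (rdx)
	----------------------------------------
	Spill slot for the 3rd argument (r8)
	----------------------------------------
	Spill slot for the 4th argument (r9)
	----------------------------------------
	5th argument
	----------------------------------------
	(stack, higher addresses)
	----------------------------------------
	State of the stack when calling a function with five arguments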
#endif
#if 0
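Quoted from a reference article on the site http://kirihari.net/ : "Memory-to-memory transfer program in 128-bit units using SSE2 (movdqu)" (http://kirihari.net/program/memcpy.html)
Memory-to-memory transfer program in 128-bit units using SSE2 (movdqu)

SSE2 extends the SIMD instructions with 128-bit integer operations, and besides arithmetic, the movdqa and movdqu instructions can be used to copy data between memory locations quickly in 128-bit units.
movdqa requires the address of the memory being transferred to be on a 16-byte boundary (aligned), that is, the address value must be divisible by 16.
In contrast, movdqu does not need 16-byte alignment and can read and write at any address value.
However, because aligned data is easier for the CPU to process, the movdqa instruction executes faster.
If speed is the goal, 16-byte alignment has to be kept in mind when allocating with malloc or declaring arrays.
Intel publishes the document below about the basic execution speed of each instruction, but in that document the throughput and latency of movdqa and movdqu are the same...
It is unclear what that means.
- Intel technical document "IA-32 instruction latency and throughput" (PDF)

That document also lists the MMX and SSE extension instructions.
Roughly speaking, MMX is an extension with 64-bit integer SIMD instructions, and SSE is an extension with 128-bit floating-point SIMD instructions.
For now, a sample that transfers data between memory locations using movdqu looks like the following.
It runs in a Linux + gcc environment.
Just pass pointers in src and dst, and specify the transfer size in size.
The normalcalc part was added for the case that the SSE2 part does not handle (translator note: the original wording of this sentence could not be fully recovered).

Source code
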
//SSE2movdqu߂păԓ]
void memcpy_sse2(void* src, void* dst, size_t size) {
  asm("  mov     %0,%%eax          \n"
      "  lea     (%%esi,%0),%%ebx  \n"
      "ssecalc:                    \n"
      "  movdqu  (%%esi),%%xmm0    \n"
      "  lea     16(%%esi),%%esi   \n"
      "  movdqu  %%xmm0,(%%edi)    \n"
      "  lea     16(%%edi),%%edi   \n"
      "  sub     $16,%%eax         \n"
      "  cmp     $16,%%eax         \n"
      "  jge     ssecalc           \n"
      "  cmp     %%ebx,%%esi       \n"
      "  je      end               \n"
      "normalcalc:                 \n"
      "  mov     (%%esi),%%al      \n"
      "  lea     1(%%esi),%%esi    \n"
      "  mov     %%al,(%%edi)      \n"
      "  lea     1(%%edi),%%edi    \n"
      "  cmp     %%ebx,%%esi       \n"
      "  jne     normalcalc        \n"
      " end:                       \n"
      "  emms"
      : : "r"(size),"S"(src),"D"(dst));
}
#endif
