// Copies Size bytes from pSrc to pDest in 128-byte chunks using eight XMM
// registers per iteration (MSVC x86 inline assembly, SSE2).
//
// pSrc  - source buffer; must be 16-byte aligned because movdqa faults on
//         unaligned addresses.
// pDest - destination buffer; same 16-byte alignment requirement.
// Size  - number of bytes requested. Only Size / 128 whole chunks are copied;
//         the trailing Size % 128 bytes are NOT copied.
//
// This variant keeps both base pointers fixed and advances a shared byte
// offset in esi.
void MemcpySIMD(char* pSrc, char *pDest, int Size)
{
	int loopSize = Size / 128;

	// The loop below is a do/while (dec ecx / jnz). Entering it with a count
	// of zero or less would not stop after the first pass and would run far
	// past both buffers, so bail out when there is no whole chunk to copy.
	if (loopSize <= 0)
		return;

	__asm {
		pushad
		mov ecx, loopSize        // remaining 128-byte chunks
		mov eax, pSrc            // source base
		mov edx, pDest           // destination base
		mov esi, 0               // byte offset shared by source and destination

	MEMLP:
		// Load one 128-byte chunk into xmm0..xmm7.
		movdqa xmm0, [eax + esi]
		movdqa xmm1, [eax + esi + 16]
		movdqa xmm2, [eax + esi + 32]
		movdqa xmm3, [eax + esi + 48]
		movdqa xmm4, [eax + esi + 64]
		movdqa xmm5, [eax + esi + 80]
		movdqa xmm6, [eax + esi + 96]
		movdqa xmm7, [eax + esi + 112]

		// Store the chunk to the destination at the same offset.
		movdqa [edx + esi], xmm0
		movdqa [edx + esi + 16], xmm1
		movdqa [edx + esi + 32], xmm2
		movdqa [edx + esi + 48], xmm3
		movdqa [edx + esi + 64], xmm4
		movdqa [edx + esi + 80], xmm5
		movdqa [edx + esi + 96], xmm6
		movdqa [edx + esi + 112], xmm7

		add esi, 128
		dec ecx
		jnz MEMLP
		sfence
		popad
	}
}
// Copies Size bytes from pSrc to pDest in 128-byte chunks using eight XMM
// registers per iteration (MSVC x86 inline assembly, SSE2).
//
// pSrc  - source buffer; must be 16-byte aligned because movdqa faults on
//         unaligned addresses.
// pDest - destination buffer; same 16-byte alignment requirement.
// Size  - number of bytes requested. Only Size / 128 whole chunks are copied;
//         the trailing Size % 128 bytes are NOT copied.
//
// This variant advances the source and destination pointers themselves by
// 128 bytes per iteration.
void MemcpySIMD2(char* pSrc, char *pDest, int Size)
{
	int loopSize = Size / 128;

	// The loop below is a do/while (dec ecx / jnz). Entering it with a count
	// of zero or less would not stop after the first pass and would run far
	// past both buffers, so bail out when there is no whole chunk to copy.
	if (loopSize <= 0)
		return;

	__asm {
		pushad
		mov ecx, loopSize        // remaining 128-byte chunks
		mov eax, pSrc            // running source pointer
		mov edx, pDest           // running destination pointer

	MEMLP:
		// Load one 128-byte chunk into xmm0..xmm7.
		movdqa xmm0, [eax]
		movdqa xmm1, [eax + 16]
		movdqa xmm2, [eax + 32]
		movdqa xmm3, [eax + 48]
		movdqa xmm4, [eax + 64]
		movdqa xmm5, [eax + 80]
		movdqa xmm6, [eax + 96]
		movdqa xmm7, [eax + 112]

		// Store the chunk through the cache with ordinary aligned stores.
		movdqa [edx], xmm0
		movdqa [edx + 16], xmm1
		movdqa [edx + 32], xmm2
		movdqa [edx + 48], xmm3
		movdqa [edx + 64], xmm4
		movdqa [edx + 80], xmm5
		movdqa [edx + 96], xmm6
		movdqa [edx + 112], xmm7

		add eax, 128
		add edx, 128
		dec ecx
		jnz MEMLP
		sfence
		popad
	}
}

위 copy 함수는 128byte 단위로 복사하는 함수로, 이해를 돕기 위해 128byte에 못 미치는 나머지 바이트는 복사하지 않는다. 값을 쓸 때 movdqa 명령어를 movntdq로 바꾸면 캐시를 거치지 않고 바로 메모리에 쓰게 된다.

 

// Copies Size bytes from pSrc to pDest in 128-byte chunks, loading with
// movdqa and storing with movntdq (non-temporal stores that bypass the cache
// hierarchy). MSVC x86 inline assembly, SSE2.
//
// pSrc  - source buffer; must be 16-byte aligned (movdqa requirement).
// pDest - destination buffer; must be 16-byte aligned (movntdq requirement).
// Size  - number of bytes requested. Only Size / 128 whole chunks are copied;
//         the trailing Size % 128 bytes are NOT copied.
void MemcpySIMD3(char* pSrc, char *pDest, int Size)
{
	int loopSize = Size / 128;

	// The loop below is a do/while (dec ecx / jnz). Entering it with a count
	// of zero or less would not stop after the first pass and would run far
	// past both buffers, so bail out when there is no whole chunk to copy.
	if (loopSize <= 0)
		return;

	__asm {
		pushad
		mov ecx, loopSize        // remaining 128-byte chunks
		mov eax, pSrc            // running source pointer
		mov edx, pDest           // running destination pointer

	MEMLP:
		// Load one 128-byte chunk into xmm0..xmm7.
		movdqa xmm0, [eax]
		movdqa xmm1, [eax + 16]
		movdqa xmm2, [eax + 32]
		movdqa xmm3, [eax + 48]
		movdqa xmm4, [eax + 64]
		movdqa xmm5, [eax + 80]
		movdqa xmm6, [eax + 96]
		movdqa xmm7, [eax + 112]

		// Non-temporal stores: write the chunk without pulling the
		// destination lines into the cache.
		movntdq [edx], xmm0
		movntdq [edx + 16], xmm1
		movntdq [edx + 32], xmm2
		movntdq [edx + 48], xmm3
		movntdq [edx + 64], xmm4
		movntdq [edx + 80], xmm5
		movntdq [edx + 96], xmm6
		movntdq [edx + 112], xmm7

		add eax, 128
		add edx, 128
		dec ecx
		jnz MEMLP
		// Non-temporal stores are weakly ordered; fence so they are globally
		// visible before any store issued after this function returns.
		sfence
		popad
	}
}

위와 같이 구현한 뒤 memcpy 함수와 속도를 비교해 보자.

// Benchmark driver: times the three hand-written SSE copy routines and the
// library memcpy over the same pair of 300,000,000-byte buffers and prints
// the elapsed clock() ticks for each.
int main(int argc, char* argv[])
{
	const int BuffSize = 300000000;
	const int Align = 16;

	// movdqa / movntdq fault on addresses that are not 16-byte aligned and
	// new[] does not promise that alignment, so over-allocate by Align - 1
	// bytes and round each working pointer up to the next 16-byte boundary.
	char* pRaw = new char[BuffSize + Align - 1];
	char* pRaw2 = new char[BuffSize + Align - 1];
	char* pChar = pRaw + ((Align - (reinterpret_cast<size_t>(pRaw) & (Align - 1))) & (Align - 1));
	char* pChar2 = pRaw2 + ((Align - (reinterpret_cast<size_t>(pRaw2) & (Align - 1))) & (Align - 1));

	// Fill the source and also touch every byte of the destination, so the
	// first timed copy does not pay for the destination's first-touch page
	// faults while the later ones do not.
	for (int i = 0; i < BuffSize; i++)
	{
		pChar[i] = 5;
		pChar2[i] = 0;
	}

	clock_t t_start, t_end;

	t_start = clock();
	MemcpySIMD(pChar, pChar2, BuffSize);
	t_end = clock();
	cout << "movdqa1 : " << t_end - t_start << endl;

	t_start = clock();
	MemcpySIMD2(pChar, pChar2, BuffSize);
	t_end = clock();
	cout << "movdqa2 : " << t_end - t_start << endl;

	// memcpy takes (destination, source), the reverse of the MemcpySIMD*
	// helpers; copy pChar -> pChar2 like every other measurement.
	t_start = clock();
	memcpy(pChar2, pChar, BuffSize);
	t_end = clock();
	cout << "memcpy : " << t_end - t_start << endl;

	t_start = clock();
	MemcpySIMD3(pChar, pChar2, BuffSize);
	t_end = clock();
	cout << "movnts : " << t_end - t_start << endl;

	// Release through the pointers returned by new[], not the aligned ones.
	delete[] pRaw;
	delete[] pRaw2;

	return 0;
}

 

요즘의 memcpy 구현은 큰 블록을 복사할 때 이미 캐시를 고려해 최적화되어 동작하므로, movntdq를 사용한 버전(위 출력의 movnts)과 속도가 크게 차이 나지 않는다.

Posted by pi92

블로그 이미지
pi92

공지사항

Yesterday
Today
Total

달력

 « |  » 2025.5
1 2 3
4 5 6 7 8 9 10
11 12 13 14 15 16 17
18 19 20 21 22 23 24
25 26 27 28 29 30 31

최근에 올라온 글

최근에 달린 댓글

글 보관함