C 内联汇编实现的 CRC32 算法

__declspec(naked) static __cdecl getCrc32(unsigned int size, unsigned char* buffer) {
	
	__asm {
					mov edx, 4[esp]        ; - U ecx <- loop count 
					mov esi, 8[esp]        ; - V esi <- source buffer 
					
					lea edi, [crc32_table] ; - U crc32 table 
					xor ecx, ecx		   ; - V 
					
					mov eax, -1 		   ; - U 
					add esi, edx 		   ; - V 
					
					neg edx                ; - N 
					
					mov cl, al		       ; - U 
					align 16			   ; - V 
					
			main_loop:
			
					shr eax, 8         	   ; - U
					xor cl, [edx+esi]  	   ; - V  2 cycle ... 
					
					xor eax, [edi + ecx*4] ; - U 
					inc edx				   ; - V  2 cycle .. 
					
					mov cl, al			   ; - U
					jne main_loop 		   ; - V  1 cycle ... 
			imt_ret:
					xor eax, -1
					ret				
	}
}


最初写的时候忘记保存寄存器了(esi、edi 属于被调用者需要保存的寄存器)。这是参考原始的 crc32 查表算法做的汇编优化,不排除会比优化过算法的 crc32 实现慢 -_-。这里一次只处理一个字节;本来想一次多处理几个字节,但发现每一步的求值都依赖上一步的结果。优化过程也可能与无法避免的 AGI(地址生成互锁)冲突。整个循环每次迭代都要读两次内存(读 crc 表和缓冲区数据),除此之外全部用寄存器完成。值得一提的是,SSE4.2 已经从硬件指令上支持了 crc 算法(crc32 指令;注意该指令使用的是 CRC-32C 多项式,与本文查表所用的多项式不同,两者结果不能互换):http://softpixel.com/~cwright/programming/simd/sse4.php

d 语言版本
import std.digest.crc;
import std.stdio;
import std.c.stdio;

// Table-driven CRC-32 of `size` bytes starting at `pBuffer`, written as a
// naked function in DMD 32-bit inline assembly.
//
// The running value starts at 0xFFFFFFFF, each byte is folded in as
// crc = (crc >> 8) ^ crc32_table[(crc & 0xFF) ^ byte], and the result is
// inverted before returning in EAX. A size of 0 returns 0.
//
// Register use: EAX = running crc, ECX = table index, EDX = negative byte
// counter that counts up to zero, ESI = one past the end of the buffer,
// EDI = table base. The U / V / N tags are the original pipe-pairing notes.
extern(C) 
uint getCrc32(uint size, ubyte* pBuffer) {			
		asm{
					naked                  ; // no compiler prologue/epilogue; arguments are read off the stack
					
					push EDI               ; // preserve the registers clobbered below
					push ESI               ;
					
					mov EDX, 12[ESP]        ; // - U EDX <- size (8 bytes deeper after the two pushes)
					mov ESI, 16[ESP]        ; // - V ESI <- source buffer 
					
					lea EDI, [crc32_table] ; // - U EDI <- crc32 table base
					xor ECX, ECX		   ; // - V clear the upper 24 bits of the table index
					
					mov EAX, -1 		   ; // - U initial crc 0xFFFFFFFF
					add ESI, EDX 		   ; // - V ESI <- pBuffer + size
					
					neg EDX                ; // - N EDX <- -size, ZF set when size is 0
					jz crc_done            ; // nothing to hash: skip the loop entirely
					
					mov CL, AL		       ; // - U ECX <- low byte of the crc
					align 16			   ; // - V 
					
			main_loop:
			
					shr EAX, 8         	   ; // - U crc >>= 8
					xor CL, [EDX+ESI]  	   ; // - V index = old low byte ^ next input byte
					
					xor EAX, [EDI + ECX*4] ; // - U crc ^= table[index]
					inc EDX				   ; // - V step the counter, ZF set after the last byte
					
					mov CL, AL			   ; // - U next low byte (mov does not touch the flags)
					jne main_loop 		   ; // - V loop while the counter is non-zero
					
			crc_done:
					pop ESI                ;
					pop EDI                ;
					
					xor EAX, -1            ; // final inversion
					ret					   ;
			}
}
// F7D18982
// Reads the CPU time-stamp counter with the `rdtsc` instruction, which leaves
// the 64-bit count in EDX:EAX. The function is naked, so `ret` hands those
// registers back untouched.
// NOTE(review): this relies on the 32-bit ABI returning a ulong in EDX:EAX - confirm
// before building for any other target.
extern(C)
ulong RDTSC () {
	asm {
		naked;
		rdtsc;
		ret;
	}
}
// crc32Of 
// Benchmark driver: hashes the same 64000-byte buffer 15 times with
// std.digest.crc's crc32Of and with the hand-written getCrc32, printing the
// elapsed time-stamp-counter ticks and the resulting hash for each.
void main () {
	
	uint [16000] p;
	
	p[0..$] = 0xFFFFFFFF;
	
	int index = 15;
	while (index--){
		ulong tStart = RDTSC();
	
		uint[1] drt = cast(uint[])crc32Of(p);
		uint stdTicks = cast(uint)(RDTSC() - tStart);
	
		printf ("std time : %d hash:%x\n", stdTicks, drt[0]); 
	
		// Compute the hash and the tick delta in separate statements. Passing
		// both as arguments of one printf call makes the measurement depend on
		// the order in which the arguments happen to be evaluated.
		tStart = RDTSC();
	
		uint asmHash = getCrc32(cast(uint)p.sizeof, cast(ubyte*)p.ptr);
		uint asmTicks = cast(uint)(RDTSC() - tStart);
	
		printf ("asm time : %d hash:%x\n\n", asmTicks, asmHash); 
	}
}


Microsoft Windows XP [版本 5.1.2600]
(C) 版权所有 1985-2001 Microsoft Corp.

D:\Downloads\dmd.2.067.0.windows\dmd2\windows\bin>dmd -O -release -inline -bound
scheck=off main.d

D:\Downloads\dmd.2.067.0.windows\dmd2\windows\bin>main
std time : 944818 hash:2d732d7c
asm time : 476119 hash:2d732d7c

std time : 807100 hash:2d732d7c
asm time : 483308 hash:2d732d7c

std time : 828443 hash:2d732d7c
asm time : 440832 hash:2d732d7c

std time : 787787 hash:2d732d7c
asm time : 466522 hash:2d732d7c

std time : 1089501 hash:2d732d7c
asm time : 467411 hash:2d732d7c

std time : 777931 hash:2d732d7c
asm time : 480207 hash:2d732d7c

std time : 816949 hash:2d732d7c
asm time : 463925 hash:2d732d7c

std time : 782110 hash:2d732d7c
asm time : 440370 hash:2d732d7c

std time : 802452 hash:2d732d7c
asm time : 494844 hash:2d732d7c

std time : 823711 hash:2d732d7c
asm time : 483938 hash:2d732d7c

std time : 807352 hash:2d732d7c
asm time : 440496 hash:2d732d7c

std time : 810950 hash:2d732d7c
asm time : 1092294 hash:2d732d7c

std time : 824159 hash:2d732d7c
asm time : 557011 hash:2d732d7c

std time : 983766 hash:2d732d7c
asm time : 484351 hash:2d732d7c

std time : 786254 hash:2d732d7c
asm time : 440832 hash:2d732d7c


D:\Downloads\dmd.2.067.0.windows\dmd2\windows\bin>
貌似也快不了多少 ... -_- ..

后来又在 VC 下测试,发现汇编版还是慢了点(横向对比一下,发现 dmd 的优化确实不咋地)。于是改了一下:应该是 addressing mode(寻址方式)的切换会多出开销;改完之后流水线配对也全被打乱了,结果发现反而更快……代码如下:

#include <stdio.h>
#include <windows.h>

/*
 * 256-entry lookup table for the byte-at-a-time CRC-32 routines in this file.
 * Each routine indexes it with (crc & 0xFF) ^ input_byte and XORs the entry
 * into crc >> 8. Entry 128 is 0xedb88320.
 * NOTE(review): these look like the standard reflected CRC-32 table values
 * (polynomial 0xEDB88320) - confirm against a generated table before reuse.
 */
unsigned int crc32_table [256] =
{
		0x00000000,0x77073096,0xee0e612c,0x990951ba,0x076dc419,0x706af48f,0xe963a535,
		0x9e6495a3,0x0edb8832,0x79dcb8a4,0xe0d5e91e,0x97d2d988,0x09b64c2b,0x7eb17cbd,
		0xe7b82d07,0x90bf1d91,0x1db71064,0x6ab020f2,0xf3b97148,0x84be41de,0x1adad47d,
		0x6ddde4eb,0xf4d4b551,0x83d385c7,0x136c9856,0x646ba8c0,0xfd62f97a,0x8a65c9ec,
		0x14015c4f,0x63066cd9,0xfa0f3d63,0x8d080df5,0x3b6e20c8,0x4c69105e,0xd56041e4,
		0xa2677172,0x3c03e4d1,0x4b04d447,0xd20d85fd,0xa50ab56b,0x35b5a8fa,0x42b2986c,
		0xdbbbc9d6,0xacbcf940,0x32d86ce3,0x45df5c75,0xdcd60dcf,0xabd13d59,0x26d930ac,
		0x51de003a,0xc8d75180,0xbfd06116,0x21b4f4b5,0x56b3c423,0xcfba9599,0xb8bda50f,
		0x2802b89e,0x5f058808,0xc60cd9b2,0xb10be924,0x2f6f7c87,0x58684c11,0xc1611dab,
		0xb6662d3d,0x76dc4190,0x01db7106,0x98d220bc,0xefd5102a,0x71b18589,0x06b6b51f,
		0x9fbfe4a5,0xe8b8d433,0x7807c9a2,0x0f00f934,0x9609a88e,0xe10e9818,0x7f6a0dbb,
		0x086d3d2d,0x91646c97,0xe6635c01,0x6b6b51f4,0x1c6c6162,0x856530d8,0xf262004e,
		0x6c0695ed,0x1b01a57b,0x8208f4c1,0xf50fc457,0x65b0d9c6,0x12b7e950,0x8bbeb8ea,
		0xfcb9887c,0x62dd1ddf,0x15da2d49,0x8cd37cf3,0xfbd44c65,0x4db26158,0x3ab551ce,
		0xa3bc0074,0xd4bb30e2,0x4adfa541,0x3dd895d7,0xa4d1c46d,0xd3d6f4fb,0x4369e96a,
		0x346ed9fc,0xad678846,0xda60b8d0,0x44042d73,0x33031de5,0xaa0a4c5f,0xdd0d7cc9,
		0x5005713c,0x270241aa,0xbe0b1010,0xc90c2086,0x5768b525,0x206f85b3,0xb966d409,
		0xce61e49f,0x5edef90e,0x29d9c998,0xb0d09822,0xc7d7a8b4,0x59b33d17,0x2eb40d81,
		0xb7bd5c3b,0xc0ba6cad,0xedb88320,0x9abfb3b6,0x03b6e20c,0x74b1d29a,0xead54739,
		0x9dd277af,0x04db2615,0x73dc1683,0xe3630b12,0x94643b84,0x0d6d6a3e,0x7a6a5aa8,
		0xe40ecf0b,0x9309ff9d,0x0a00ae27,0x7d079eb1,0xf00f9344,0x8708a3d2,0x1e01f268,
		0x6906c2fe,0xf762575d,0x806567cb,0x196c3671,0x6e6b06e7,0xfed41b76,0x89d32be0,
		0x10da7a5a,0x67dd4acc,0xf9b9df6f,0x8ebeeff9,0x17b7be43,0x60b08ed5,0xd6d6a3e8,
		0xa1d1937e,0x38d8c2c4,0x4fdff252,0xd1bb67f1,0xa6bc5767,0x3fb506dd,0x48b2364b,
		0xd80d2bda,0xaf0a1b4c,0x36034af6,0x41047a60,0xdf60efc3,0xa867df55,0x316e8eef,
		0x4669be79,0xcb61b38c,0xbc66831a,0x256fd2a0,0x5268e236,0xcc0c7795,0xbb0b4703,
		0x220216b9,0x5505262f,0xc5ba3bbe,0xb2bd0b28,0x2bb45a92,0x5cb36a04,0xc2d7ffa7,
		0xb5d0cf31,0x2cd99e8b,0x5bdeae1d,0x9b64c2b0,0xec63f226,0x756aa39c,0x026d930a,
		0x9c0906a9,0xeb0e363f,0x72076785,0x05005713,0x95bf4a82,0xe2b87a14,0x7bb12bae,
		0x0cb61b38,0x92d28e9b,0xe5d5be0d,0x7cdcefb7,0x0bdbdf21,0x86d3d2d4,0xf1d4e242,
		0x68ddb3f8,0x1fda836e,0x81be16cd,0xf6b9265b,0x6fb077e1,0x18b74777,0x88085ae6,
		0xff0f6a70,0x66063bca,0x11010b5c,0x8f659eff,0xf862ae69,0x616bffd3,0x166ccf45,
		0xa00ae278,0xd70dd2ee,0x4e048354,0x3903b3c2,0xa7672661,0xd06016f7,0x4969474d,
		0x3e6e77db,0xaed16a4a,0xd9d65adc,0x40df0b66,0x37d83bf0,0xa9bcae53,0xdebb9ec5,
		0x47b2cf7f,0x30b5ffe9,0xbdbdf21c,0xcabac28a,0x53b39330,0x24b4a3a6,0xbad03605,
		0xcdd70693,0x54de5729,0x23d967bf,0xb3667a2e,0xc4614ab8,0x5d681b02,0x2a6f2b94,
		0xb40bbe37,0xc30c8ea1,0x5a05df1b,0x2d02ef8d
};

/*
 * Table-driven CRC-32 of `size` bytes starting at `buffer`, written as a naked
 * cdecl function in MSVC 32-bit inline assembly. This variant addresses
 * crc32_table directly and walks the buffer with a plain incrementing pointer.
 *
 * The running value starts at 0xFFFFFFFF, each byte is folded in as
 * crc = (crc >> 8) ^ crc32_table[(crc & 0xFF) ^ byte], and the result is
 * inverted before returning in eax. A size of 0 returns 0.
 *
 * Register use: eax = running crc, ecx = table index, edx = current input
 * byte, esi = read pointer, edi = bytes remaining.
 */
__declspec(naked) static  unsigned int __cdecl asm_getCrc32(unsigned int size, unsigned char* buffer) {  

	__asm {  
		push edi                       ; preserve callee-saved register
		push esi                       ; preserve callee-saved register

		mov edi, 12[esp]               ; edi <- size (bytes remaining)
		mov esi, 16[esp]               ; esi <- buffer

		xor ecx, ecx                   ; clear the upper 24 bits of the table index
		mov eax, -1                    ; eax <- initial crc 0xFFFFFFFF

		test edi, edi                  ; size == 0 ?
		jz  crc_done                   ; nothing to hash: skip the loop entirely

		mov cl, al                     ; ecx <- low byte of the crc
		align 16

main_loop:  

		shr eax, 8                     ; crc >>= 8
		movzx edx, byte ptr[esi]       ; edx <- next input byte, zero-extended

		inc esi                        ; advance the read pointer
		xor ecx, edx                   ; index = old low byte ^ input byte

		xor eax, [crc32_table + ecx*4] ; crc ^= table[index]
		movzx ecx, al                  ; next low byte

		dec edi                        ; one byte consumed, ZF set after the last one
		jne main_loop                  ; loop while bytes remain

crc_done:
		pop esi
		pop edi 

		xor eax, -1                    ; final inversion
		ret               
	}  
}  

/*
 * Reads the CPU time-stamp counter with the rdtsc instruction, which leaves
 * the 64-bit count in edx:eax. The function is naked, so ret hands those
 * registers back untouched.
 * NOTE(review): relies on the 32-bit cdecl convention returning a 64-bit
 * integer in edx:eax - confirm before building for any other target.
 */
__declspec(naked) static unsigned __int64 __cdecl RDTSC(){
	__asm {
		rdtsc
		ret 
	}
}

DWORD getCrc32(int size, unsigned char* c)
{
	DWORD r = 0xFFFFFFFFUL;
	for (int i = 0; i < size; i++)
		r = (r >> 8) ^ crc32_table[(BYTE)r ^ *c++];

	return r ^ 0xFFFFFFFFUL;
}

/*
 * Benchmark driver: hashes the same zero-filled buffer ROUNDS times with the
 * compiled C getCrc32 and with the hand-written asm_getCrc32, printing the
 * elapsed timeGetTime() milliseconds for each.
 */
int main(void){
	/* Only BUFFER_BYTES bytes are ever hashed, so the buffer holds exactly that many. */
	enum { BUFFER_BYTES = 128000000, ROUNDS = 20 };
	static unsigned int m_array[BUFFER_BYTES / sizeof(unsigned int)];

	unsigned int i = ROUNDS;
	while(i--){
		unsigned __int32 tStart = timeGetTime();
		/*
		 * Hand-coded cdecl call (arguments pushed right to left, caller pops),
		 * presumably so the compiler cannot inline or drop the reference
		 * routine. Uses the direct call form for a C function.
		 */
		__asm {
			lea eax,[m_array]
			push eax
			push BUFFER_BYTES
			call getCrc32
			add esp, 8
		}

		printf ("nor time : %d \n", (unsigned int)(timeGetTime() - tStart));

		tStart= timeGetTime();
		UINT hi = asm_getCrc32(BUFFER_BYTES, (unsigned char*)m_array);
		printf ("asm time : %d \n\n", (unsigned int)(timeGetTime() - tStart));
		(void)hi;	/* the hash itself is not reported by this benchmark */
	}
	return 0;
}

vc++ 2010 release 版本结果

nor time : 551
asm time : 461

nor time : 468
asm time : 456

nor time : 501
asm time : 492

nor time : 492
asm time : 462

nor time : 513
asm time : 690

nor time : 633
asm time : 516

nor time : 475
asm time : 448

nor time : 464
asm time : 472

nor time : 460
asm time : 441

nor time : 485
asm time : 468

nor time : 483
asm time : 450

nor time : 449
asm time : 471

nor time : 512
asm time : 463

nor time : 477
asm time : 487

nor time : 478
asm time : 463

nor time : 468
asm time : 481

nor time : 462
asm time : 463

nor time : 459
asm time : 451

nor time : 484
asm time : 501

nor time : 467
asm time : 487

/*
 * Table-driven CRC-32 of `size` bytes starting at `buffer`, written as a naked
 * cdecl function in MSVC 32-bit inline assembly. This variant preloads the
 * table index with 0xFF (the low byte of the initial crc) and decrements the
 * byte counter before reloading the index.
 *
 * The running value starts at 0xFFFFFFFF, each byte is folded in as
 * crc = (crc >> 8) ^ crc32_table[(crc & 0xFF) ^ byte], and the result is
 * inverted before returning in eax. A size of 0 returns 0.
 *
 * Register use: eax = running crc, ecx = table index, edx = current input
 * byte, esi = read pointer, edi = bytes remaining.
 */
__declspec(naked) static  unsigned int __cdecl asm_getCrc32(unsigned int size, unsigned char* buffer) {  

	__asm {  
		    push edi                       ; preserve callee-saved register
		    push esi                       ; preserve callee-saved register

			mov eax, -1                    ; eax <- initial crc 0xFFFFFFFF
			mov ecx, 0xFF                  ; ecx <- low byte of the initial crc

			mov edi, 12[esp]               ; edi <- size (bytes remaining)
			mov esi, 16[esp]               ; esi <- buffer

			test edi, edi                  ; size == 0 ?
			jz  crc_done                   ; nothing to hash: skip the loop entirely

			align 16

main_loop:  
	
		    shr eax, 8                     ; crc >>= 8
			movzx edx, byte ptr[esi]       ; edx <- next input byte, zero-extended
			
			inc esi                        ; advance the read pointer
			xor ecx, edx                   ; index = old low byte ^ input byte

			xor eax, [crc32_table + ecx*4] ; crc ^= table[index]
			dec edi                        ; one byte consumed, ZF set after the last one

			movzx ecx, al                  ; next low byte (movzx does not touch the flags)
			jne main_loop                  ; loop while bytes remain

crc_done:
			pop esi
			pop edi 

		    xor eax, -1                    ; final inversion
			ret               
	}  
}  

猜你喜欢

转载自xuling1993728.iteye.com/blog/2203230