Someone once posted a code snippet in a QQ group that reimplemented memcpy by hand; it went roughly like this:
void COPY(void *dest, void *src, size_t len)
{
    char *cdest = dest;
    char *csrc = src;
    size_t i;
    for(i = 0; i < len; i++)
        cdest[i] = csrc[i];
}
The code is simple, direct, and easy to understand, but do we really need to reinvent this particular wheel? Before anything else: is memcpy actually implemented like this?
Let's look at the disassembly of VS2012's memcpy. I threw together a throwaway project and typed in some arbitrary code; the only goal was to hit a breakpoint at a memcpy call and step into it.
As you can see, VS2012's memcpy is written in assembly, and the source is visible in the debugger. I've copied it out below. The detailed explanation comes later; the listing here is only for reference and can be skimmed.
--- f:\dd\vctools\crt_bld\SELF_X86\crt\src\INTEL\memcpy.asm --------------------
dst:ptr byte, \
src:ptr byte, \
count:IWORD
; destination pointer
; source pointer
; number of bytes to copy
OPTION PROLOGUE:NONE, EPILOGUE:NONE
push edi ;U - save edi
0012B090 57 push edi
push esi ;V - save esi
0012B091 56 push esi
; size param/4 prolog byte #reg saved
.FPO ( 0, 3 , $-_MEM_ , 2, 0, 0 )
mov esi,[esp + 010h] ;U - esi = source
0012B092 8B 74 24 10 mov esi,dword ptr [esp+10h]
mov ecx,[esp + 014h] ;V - ecx = number of bytes to move
0012B096 8B 4C 24 14 mov ecx,dword ptr [esp+14h]
mov edi,[esp + 0Ch] ;U - edi = dest
0012B09A 8B 7C 24 0C mov edi,dword ptr [esp+0Ch]
;
; Check for overlapping buffers:
; If (dst <= src) Or (dst >= src + Count) Then
; Do normal (Upwards) Copy
; Else
; Do Downwards Copy to avoid propagation
;
mov eax,ecx ;V - eax = byte count...
0012B09E 8B C1 mov eax,ecx
mov edx,ecx ;U - edx = byte count...
0012B0A0 8B D1 mov edx,ecx
add eax,esi ;V - eax = point past source end
0012B0A2 03 C6 add eax,esi
cmp edi,esi ;U - dst <= src ?
0012B0A4 3B FE cmp edi,esi
jbe short CopyUp ;V - yes, copy toward higher addresses
0012B0A6 76 08 jbe CopyUp (012B0B0h)
cmp edi,eax ;U - dst < (src + count) ?
0012B0A8 3B F8 cmp edi,eax
jb CopyDown ;V - yes, copy toward lower addresses
0012B0AA 0F 82 68 03 00 00 jb TrailUpVec+50h (012B418h)
;
; Copy toward higher addresses.
;
CopyUp:
;
; See if Enhanced Fast Strings is supported.
; ENFSTRG supported?
bt __favor, __FAVOR_ENFSTRG
0012B0B0 0F BA 25 FC 63 17 00 01 bt dword ptr ds:[1763FCh],1
jnc CopyUpSSE2Check ; no jump
0012B0B8 73 07 jae CopyUp+11h (012B0C1h)
;
; use Enhanced Fast Strings
rep movsb
0012B0BA F3 A4 rep movs byte ptr es:[edi],byte ptr [esi]
jmp TrailUp0 ; Done
0012B0BC E9 17 03 00 00 jmp TrailUpVec+10h (012B3D8h)
CopyUpSSE2Check:
;
; Next, see if we can use a "fast" copy SSE2 routine
; block size greater than min threshold?
cmp ecx,080h
0012B0C1 81 F9 80 00 00 00 cmp ecx,80h
jb Dword_align ; length too small go use dwords
0012B0C7 0F 82 CE 01 00 00 jb CopyUp+1EBh (012B29Bh)
; alignments equal?
mov eax,edi
0012B0CD 8B C7 mov eax,edi
xor eax,esi
0012B0CF 33 C6 xor eax,esi
test eax,15
0012B0D1 A9 0F 00 00 00 test eax,0Fh
jne AtomChk ; Not aligned go check Atom
0012B0D6 75 0E jne CopyUp+36h (012B0E6h)
bt __isa_enabled, __ISA_AVAILABLE_SSE2
0012B0D8 0F BA 25 00 50 17 00 01 bt dword ptr ds:[175000h],1
jc VEC_memcpy ; yes, go SSE2 copy (params already set)
0012B0E0 0F 82 DA 04 00 00 jb TrailDownVec+5Ch (012B5C0h)
AtomChk:
; Is Atom supported?
bt __favor, __FAVOR_ATOM
0012B0E6 0F BA 25 FC 63 17 00 00 bt dword ptr ds:[1763FCh],0
jnc Dword_align ; no,jump
0012B0EE 0F 83 A7 01 00 00 jae CopyUp+1EBh (012B29Bh)
; check if dst is 4 byte aligned
test edi, 3
0012B0F4 F7 C7 03 00 00 00 test edi,3
jne CopyLeadUp
0012B0FA 0F 85 B8 01 00 00 jne CopyUp+208h (012B2B8h)
; check if src is 4 byte aligned
test esi, 3
0012B100 F7 C6 03 00 00 00 test esi,3
jne Dword_align_Ok
0012B106 0F 85 97 01 00 00 jne CopyUp+1F3h (012B2A3h)
; A software pipelining vectorized memcpy loop using PALIGN instructions
; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
; 4 byte align -> 12 byte copy, 8 byte align -> 8 byte copy, 12 byte align -> 4 byte copy
PalignHead4:
bt edi, 2
0012B10C 0F BA E7 02 bt edi,2
jae PalignHead8
0012B110 73 0D jae CopyUp+6Fh (012B11Fh)
mov eax, dword ptr [esi]
0012B112 8B 06 mov eax,dword ptr [esi]
sub ecx, 4
0012B114 83 E9 04 sub ecx,4
lea esi, byte ptr [esi+4]
0012B117 8D 76 04 lea esi,[esi+4]
mov dword ptr [edi], eax
0012B11A 89 07 mov dword ptr [edi],eax
lea edi, byte ptr [edi+4]
0012B11C 8D 7F 04 lea edi,[edi+4]
PalignHead8:
bt edi, 3
0012B11F 0F BA E7 03 bt edi,3
jae PalignLoop
0012B123 73 11 jae CopyUp+86h (012B136h)
movq xmm1, qword ptr [esi]
0012B125 F3 0F 7E 0E movq xmm1,mmword ptr [esi]
sub ecx, 8
0012B129 83 E9 08 sub ecx,8
lea esi, byte ptr [esi+8]
0012B12C 8D 76 08 lea esi,[esi+8]
movq qword ptr [edi], xmm1
0012B12F 66 0F D6 0F movq mmword ptr [edi],xmm1
lea edi, byte ptr [edi+8]
0012B133 8D 7F 08 lea edi,[edi+8]
;(2) Use SSE palign loop
PalignLoop:
test esi, 7
0012B136 F7 C6 07 00 00 00 test esi,7
je MovPalign8
0012B13C 74 63 je CopyUp+0F1h (012B1A1h)
bt esi, 3
0012B13E 0F BA E6 03 bt esi,3
jae MovPalign4
0012B142 0F 83 B2 00 00 00 jae CopyUp+14Ah (012B1FAh)
PALIGN_memcpy 12
0012B148 66 0F 6F 4E F4 movdqa xmm1,xmmword ptr [esi-0Ch]
0012B14D 8D 76 F4 lea esi,[esi-0Ch]
PalignLoop12:
0012B150 66 0F 6F 5E 10 movdqa xmm3,xmmword ptr [esi+10h]
0012B155 83 E9 30 sub ecx,30h
0012B158 66 0F 6F 46 20 movdqa xmm0,xmmword ptr [esi+20h]
0012B15D 66 0F 6F 6E 30 movdqa xmm5,xmmword ptr [esi+30h]
0012B162 8D 76 30 lea esi,[esi+30h]
0012B165 83 F9 30 cmp ecx,30h
0012B168 66 0F 6F D3 movdqa xmm2,xmm3
0012B16C 66 0F 3A 0F D9 0C palignr xmm3,xmm1,0Ch
0012B172 66 0F 7F 1F movdqa xmmword ptr [edi],xmm3
0012B176 66 0F 6F E0 movdqa xmm4,xmm0
0012B17A 66 0F 3A 0F C2 0C palignr xmm0,xmm2,0Ch
0012B180 66 0F 7F 47 10 movdqa xmmword ptr [edi+10h],xmm0
0012B185 66 0F 6F CD movdqa xmm1,xmm5
0012B189 66 0F 3A 0F EC 0C palignr xmm5,xmm4,0Ch
0012B18F 66 0F 7F 6F 20 movdqa xmmword ptr [edi+20h],xmm5
0012B194 8D 7F 30 lea edi,[edi+30h]
0012B197 7D B7 jge CopyUp+0A0h (012B150h)
0012B199 8D 76 0C lea esi,[esi+0Ch]
jmp PalignTail
0012B19C E9 AF 00 00 00 jmp CopyUp+1A0h (012B250h)
PALIGN_memcpy 8
0012B1A1 66 0F 6F 4E F8 movdqa xmm1,xmmword ptr [esi-8]
0012B1A6 8D 76 F8 lea esi,[esi-8]
0012B1A9 8D 49 00 lea ecx,[ecx]
PalignLoop8:
0012B1AC 66 0F 6F 5E 10 movdqa xmm3,xmmword ptr [esi+10h]
0012B1B1 83 E9 30 sub ecx,30h
0012B1B4 66 0F 6F 46 20 movdqa xmm0,xmmword ptr [esi+20h]
0012B1B9 66 0F 6F 6E 30 movdqa xmm5,xmmword ptr [esi+30h]
0012B1BE 8D 76 30 lea esi,[esi+30h]
0012B1C1 83 F9 30 cmp ecx,30h
0012B1C4 66 0F 6F D3 movdqa xmm2,xmm3
0012B1C8 66 0F 3A 0F D9 08 palignr xmm3,xmm1,8
0012B1CE 66 0F 7F 1F movdqa xmmword ptr [edi],xmm3
0012B1D2 66 0F 6F E0 movdqa xmm4,xmm0
0012B1D6 66 0F 3A 0F C2 08 palignr xmm0,xmm2,8
0012B1DC 66 0F 7F 47 10 movdqa xmmword ptr [edi+10h],xmm0
0012B1E1 66 0F 6F CD movdqa xmm1,xmm5
0012B1E5 66 0F 3A 0F EC 08 palignr xmm5,xmm4,8
0012B1EB 66 0F 7F 6F 20 movdqa xmmword ptr [edi+20h],xmm5
0012B1F0 8D 7F 30 lea edi,[edi+30h]
0012B1F3 7D B7 jge CopyUp+0FCh (012B1ACh)
0012B1F5 8D 76 08 lea esi,[esi+8]
jmp PalignTail
0012B1F8 EB 56 jmp CopyUp+1A0h (012B250h)
PALIGN_memcpy 4
0012B1FA 66 0F 6F 4E FC movdqa xmm1,xmmword ptr [esi-4]
0012B1FF 8D 76 FC lea esi,[esi-4]
0012B202 8B FF mov edi,edi
PalignLoop4:
0012B204 66 0F 6F 5E 10 movdqa xmm3,xmmword ptr [esi+10h]
0012B209 83 E9 30 sub ecx,30h
0012B20C 66 0F 6F 46 20 movdqa xmm0,xmmword ptr [esi+20h]
0012B211 66 0F 6F 6E 30 movdqa xmm5,xmmword ptr [esi+30h]
0012B216 8D 76 30 lea esi,[esi+30h]
0012B219 83 F9 30 cmp ecx,30h
0012B21C 66 0F 6F D3 movdqa xmm2,xmm3
0012B220 66 0F 3A 0F D9 04 palignr xmm3,xmm1,4
0012B226 66 0F 7F 1F movdqa xmmword ptr [edi],xmm3
0012B22A 66 0F 6F E0 movdqa xmm4,xmm0
0012B22E 66 0F 3A 0F C2 04 palignr xmm0,xmm2,4
0012B234 66 0F 7F 47 10 movdqa xmmword ptr [edi+10h],xmm0
0012B239 66 0F 6F CD movdqa xmm1,xmm5
0012B23D 66 0F 3A 0F EC 04 palignr xmm5,xmm4,4
0012B243 66 0F 7F 6F 20 movdqa xmmword ptr [edi+20h],xmm5
0012B248 8D 7F 30 lea edi,[edi+30h]
0012B24B 7D B7 jge CopyUp+154h (012B204h)
0012B24D 8D 76 04 lea esi,[esi+4]
;(3) Copy the tailing bytes.
PalignTail:
cmp ecx,10h
0012B250 83 F9 10 cmp ecx,10h
jl PalignTail4
0012B253 7C 13 jl CopyUp+1B8h (012B268h)
movdqu xmm1,xmmword ptr [esi]
0012B255 F3 0F 6F 0E movdqu xmm1,xmmword ptr [esi]
sub ecx, 10h
0012B259 83 E9 10 sub ecx,10h
lea esi, xmmword ptr [esi+10h]
0012B25C 8D 76 10 lea esi,[esi+10h]
movdqa xmmword ptr [edi],xmm1
0012B25F 66 0F 7F 0F movdqa xmmword ptr [edi],xmm1
lea edi, xmmword ptr [edi+10h]
0012B263 8D 7F 10 lea edi,[edi+10h]
jmp PalignTail
0012B266 EB E8 jmp CopyUp+1A0h (012B250h)
PalignTail4:
bt ecx, 2
0012B268 0F BA E1 02 bt ecx,2
jae PalignTail8
0012B26C 73 0D jae CopyUp+1CBh (012B27Bh)
mov eax, dword ptr [esi]
0012B26E 8B 06 mov eax,dword ptr [esi]
sub ecx,4
0012B270 83 E9 04 sub ecx,4
lea esi, byte ptr [esi+4]
0012B273 8D 76 04 lea esi,[esi+4]
mov dword ptr [edi], eax
0012B276 89 07 mov dword ptr [edi],eax
lea edi, byte ptr [edi+4]
0012B278 8D 7F 04 lea edi,[edi+4]
PalignTail8:
bt ecx, 3
0012B27B 0F BA E1 03 bt ecx,3
jae PalignTailLE3
0012B27F 73 11 jae CopyUp+1E2h (012B292h)
movq xmm1, qword ptr [esi]
0012B281 F3 0F 7E 0E movq xmm1,mmword ptr [esi]
sub ecx,8
0012B285 83 E9 08 sub ecx,8
lea esi, byte ptr [esi+8]
0012B288 8D 76 08 lea esi,[esi+8]
movq qword ptr [edi], xmm1
0012B28B 66 0F D6 0F movq mmword ptr [edi],xmm1
lea edi, byte ptr [edi+8]
0012B28F 8D 7F 08 lea edi,[edi+8]
PalignTailLE3:
mov eax, dword ptr TrailUpVec[ecx*4]
0012B292 8B 04 8D C8 B3 12 00 mov eax,dword ptr [ecx*4+12B3C8h]
jmp eax
0012B299 FF E0 jmp eax
; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination. This
; occurs in 3 steps.
;
; - move x = ((4 - Dest & 3) & 3) bytes
; - move y = ((L-x) >> 2) dwords
; - move (L - x - y*4) bytes
;
Dword_align:
test edi,11b ;U - destination dword aligned?
0012B29B F7 C7 03 00 00 00 test edi,3
jnz short CopyLeadUp ;V - if we are not dword aligned already, align
0012B2A1 75 15 jne CopyUp+208h (012B2B8h)
Dword_align_Ok:
shr ecx,2 ;U - shift down to dword count
0012B2A3 C1 E9 02 shr ecx,2
and edx,11b ;V - trailing byte count
0012B2A6 83 E2 03 and edx,3
cmp ecx,8 ;U - test if small enough for unwind copy
0012B2A9 83 F9 08 cmp ecx,8
jb short CopyUnwindUp ;V - if so, then jump
0012B2AC 72 2A jb CopyUp+228h (012B2D8h)
rep movsd ;N - move all of our dwords
0012B2AE F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
0012B2B0 FF 24 95 C8 B3 12 00 jmp dword ptr [edx*4+12B3C8h]
0012B2B7 90 nop
;
; Code to do optimal memory copies for non-dword-aligned destinations.
;
; The following length check is done for two reasons:
;
; 1. to ensure that the actual move length is greater than any possiale
; alignment move, and
;
; 2. to skip the multiple move logic for small moves where it would
; be faster to move the bytes with one instruction.
;
align @WordSize
CopyLeadUp:
mov eax,edi ;U - get destination offset
0012B2B8 8B C7 mov eax,edi
mov edx,11b ;V - prepare for mask
0012B2BA BA 03 00 00 00 mov edx,3
sub ecx,4 ;U - check for really short string - sub for adjust
0012B2BF 83 E9 04 sub ecx,4
jb short ByteCopyUp ;V - branch to just copy bytes
0012B2C2 72 0C jb CopyUp+220h (012B2D0h)
and eax,11b ;U - get offset within first dword
0012B2C4 83 E0 03 and eax,3
add ecx,eax ;V - update size after leading bytes copied
0012B2C7 03 C8 add ecx,eax
jmp dword ptr LeadUpVec[eax*4-4] ;N - process leading bytes
0012B2C9 FF 24 85 DC B2 12 00 jmp dword ptr [eax*4+12B2DCh]
align @WordSize
ByteCopyUp:
jmp dword ptr TrailUpVec[ecx*4+16] ;N - process just bytes
0012B2D0 FF 24 8D D8 B3 12 00 jmp dword ptr [ecx*4+12B3D8h]
0012B2D7 90 nop
align @WordSize
CopyUnwindUp:
jmp dword ptr UnwindUpVec[ecx*4] ;N - unwind dword copy
0012B2D8 FF 24 8D 5C B3 12 00 jmp dword ptr [ecx*4+12B35Ch]
0012B2DF 90 nop
--- No source file -----------------------------------------------------------------------
0012B2E0 EC in al,dx
0012B2E1 B2 12 mov dl,12h
0012B2E3 00 18 add byte ptr [eax],bl
0012B2E5 B3 12 mov bl,12h
0012B2E7 00 3C B3 add byte ptr [ebx+esi*4],bh
0012B2EA 12 00 adc al,byte ptr [eax]
LeadUp1:
0012B2EC 23 D1 and edx,ecx
0012B2EE 8A 06 mov al,byte ptr [esi]
0012B2F0 88 07 mov byte ptr [edi],al
0012B2F2 8A 46 01 mov al,byte ptr [esi+1]
0012B2F5 88 47 01 mov byte ptr [edi+1],al
0012B2F8 8A 46 02 mov al,byte ptr [esi+2]
0012B2FB C1 E9 02 shr ecx,2
0012B2FE 88 47 02 mov byte ptr [edi+2],al
0012B301 83 C6 03 add esi,3
0012B304 83 C7 03 add edi,3
0012B307 83 F9 08 cmp ecx,8
0012B30A 72 CC jb CopyUp+228h (012B2D8h)
0012B30C F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B30E FF 24 95 C8 B3 12 00 jmp dword ptr [edx*4+12B3C8h]
0012B315 8D 49 00 lea ecx,[ecx]
LeadUp2:
0012B318 23 D1 and edx,ecx
0012B31A 8A 06 mov al,byte ptr [esi]
0012B31C 88 07 mov byte ptr [edi],al
0012B31E 8A 46 01 mov al,byte ptr [esi+1]
0012B321 C1 E9 02 shr ecx,2
0012B324 88 47 01 mov byte ptr [edi+1],al
0012B327 83 C6 02 add esi,2
0012B32A 83 C7 02 add edi,2
0012B32D 83 F9 08 cmp ecx,8
0012B330 72 A6 jb CopyUp+228h (012B2D8h)
0012B332 F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B334 FF 24 95 C8 B3 12 00 jmp dword ptr [edx*4+12B3C8h]
0012B33B 90 nop
LeadUp3:
0012B33C 23 D1 and edx,ecx
0012B33E 8A 06 mov al,byte ptr [esi]
0012B340 88 07 mov byte ptr [edi],al
0012B342 83 C6 01 add esi,1
0012B345 C1 E9 02 shr ecx,2
0012B348 83 C7 01 add edi,1
0012B34B 83 F9 08 cmp ecx,8
0012B34E 72 88 jb CopyUp+228h (012B2D8h)
0012B350 F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B352 FF 24 95 C8 B3 12 00 jmp dword ptr [edx*4+12B3C8h]
0012B359 8D 49 00 lea ecx,[ecx]
0012B35C BF B3 12 00 AC mov edi,0AC0012B3h
0012B361 B3 12 mov bl,12h
0012B363 00 A4 B3 12 00 9C B3 add byte ptr [ebx+esi*4-4C63FFEEh],ah
0012B36A 12 00 adc al,byte ptr [eax]
0012B36C 94 xchg eax,esp
0012B36D B3 12 mov bl,12h
0012B36F 00 8C B3 12 00 84 B3 add byte ptr [ebx+esi*4-4C7BFFEEh],cl
0012B376 12 00 adc al,byte ptr [eax]
0012B378 7C B3 jl LeadUpVec+4Dh (012B32Dh)
0012B37A 12 00 adc al,byte ptr [eax]
UnwindUp7:
0012B37C 8B 44 8E E4 mov eax,dword ptr [esi+ecx*4-1Ch]
0012B380 89 44 8F E4 mov dword ptr [edi+ecx*4-1Ch],eax
UnwindUp6:
0012B384 8B 44 8E E8 mov eax,dword ptr [esi+ecx*4-18h]
0012B388 89 44 8F E8 mov dword ptr [edi+ecx*4-18h],eax
UnwindUp5:
0012B38C 8B 44 8E EC mov eax,dword ptr [esi+ecx*4-14h]
0012B390 89 44 8F EC mov dword ptr [edi+ecx*4-14h],eax
UnwindUp4:
0012B394 8B 44 8E F0 mov eax,dword ptr [esi+ecx*4-10h]
0012B398 89 44 8F F0 mov dword ptr [edi+ecx*4-10h],eax
UnwindUp3:
0012B39C 8B 44 8E F4 mov eax,dword ptr [esi+ecx*4-0Ch]
0012B3A0 89 44 8F F4 mov dword ptr [edi+ecx*4-0Ch],eax
UnwindUp2:
0012B3A4 8B 44 8E F8 mov eax,dword ptr [esi+ecx*4-8]
0012B3A8 89 44 8F F8 mov dword ptr [edi+ecx*4-8],eax
UnwindUp1:
0012B3AC 8B 44 8E FC mov eax,dword ptr [esi+ecx*4-4]
0012B3B0 89 44 8F FC mov dword ptr [edi+ecx*4-4],eax
0012B3B4 8D 04 8D 00 00 00 00 lea eax,[ecx*4]
0012B3BB 03 F0 add esi,eax
0012B3BD 03 F8 add edi,eax
UnwindUp0:
0012B3BF FF 24 95 C8 B3 12 00 jmp dword ptr [edx*4+12B3C8h]
0012B3C6 8B FF mov edi,edi
0012B3C8 D8 B3 12 00 E0 B3 fdiv dword ptr [ebx-4C1FFFEEh]
0012B3CE 12 00 adc al,byte ptr [eax]
0012B3D0 EC in al,dx
0012B3D1 B3 12 mov bl,12h
0012B3D3 00 00 add byte ptr [eax],al
0012B3D5 B4 12 mov ah,12h
0012B3D7 00 8B 44 24 0C 5E add byte ptr [ebx+5E0C2444h],cl
0012B3DD 5F pop edi
0012B3DE C3 ret
0012B3DF 90 nop
TrailUp1:
0012B3E0 8A 06 mov al,byte ptr [esi]
0012B3E2 88 07 mov byte ptr [edi],al
0012B3E4 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B3E8 5E pop esi
0012B3E9 5F pop edi
0012B3EA C3 ret
0012B3EB 90 nop
TrailUp2:
0012B3EC 8A 06 mov al,byte ptr [esi]
0012B3EE 88 07 mov byte ptr [edi],al
0012B3F0 8A 46 01 mov al,byte ptr [esi+1]
0012B3F3 88 47 01 mov byte ptr [edi+1],al
0012B3F6 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B3FA 5E pop esi
0012B3FB 5F pop edi
0012B3FC C3 ret
0012B3FD 8D 49 00 lea ecx,[ecx]
TrailUp3:
0012B400 8A 06 mov al,byte ptr [esi]
0012B402 88 07 mov byte ptr [edi],al
0012B404 8A 46 01 mov al,byte ptr [esi+1]
0012B407 88 47 01 mov byte ptr [edi+1],al
0012B40A 8A 46 02 mov al,byte ptr [esi+2]
0012B40D 88 47 02 mov byte ptr [edi+2],al
0012B410 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B414 5E pop esi
0012B415 5F pop edi
0012B416 C3 ret
0012B417 90 nop
CopyDown:
0012B418 8D 74 31 FC lea esi,[ecx+esi-4]
0012B41C 8D 7C 39 FC lea edi,[ecx+edi-4]
0012B420 F7 C7 03 00 00 00 test edi,3
0012B426 75 24 jne TrailUpVec+84h (012B44Ch)
0012B428 C1 E9 02 shr ecx,2
0012B42B 83 E2 03 and edx,3
0012B42E 83 F9 08 cmp ecx,8
0012B431 72 0D jb TrailUpVec+78h (012B440h)
0012B433 FD std
0012B434 F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B436 FC cld
0012B437 FF 24 95 64 B5 12 00 jmp dword ptr [edx*4+12B564h]
0012B43E 8B FF mov edi,edi
CopyUnwindDown:
0012B440 F7 D9 neg ecx
0012B442 FF 24 8D 14 B5 12 00 jmp dword ptr [ecx*4+12B514h]
0012B449 8D 49 00 lea ecx,[ecx]
CopyLeadDown:
0012B44C 8B C7 mov eax,edi
0012B44E BA 03 00 00 00 mov edx,3
0012B453 83 F9 04 cmp ecx,4
0012B456 72 0C jb TrailUpVec+9Ch (012B464h)
0012B458 83 E0 03 and eax,3
0012B45B 2B C8 sub ecx,eax
0012B45D FF 24 85 68 B4 12 00 jmp dword ptr [eax*4+12B468h]
ByteCopyDown:
0012B464 FF 24 8D 64 B5 12 00 jmp dword ptr [ecx*4+12B564h]
0012B46B 90 nop
0012B46C 78 B4 js TrailUpVec+5Ah (012B422h)
0012B46E 12 00 adc al,byte ptr [eax]
0012B470 9C pushfd
0012B471 B4 12 mov ah,12h
0012B473 00 C4 add ah,al
0012B475 B4 12 mov ah,12h
0012B477 00 8A 46 03 23 D1 add byte ptr [edx-2EDCFCBAh],cl
0012B47D 88 47 03 mov byte ptr [edi+3],al
0012B480 83 EE 01 sub esi,1
0012B483 C1 E9 02 shr ecx,2
0012B486 83 EF 01 sub edi,1
0012B489 83 F9 08 cmp ecx,8
0012B48C 72 B2 jb TrailUpVec+78h (012B440h)
0012B48E FD std
0012B48F F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B491 FC cld
0012B492 FF 24 95 64 B5 12 00 jmp dword ptr [edx*4+12B564h]
0012B499 8D 49 00 lea ecx,[ecx]
LeadDown2:
0012B49C 8A 46 03 mov al,byte ptr [esi+3]
0012B49F 23 D1 and edx,ecx
0012B4A1 88 47 03 mov byte ptr [edi+3],al
0012B4A4 8A 46 02 mov al,byte ptr [esi+2]
0012B4A7 C1 E9 02 shr ecx,2
0012B4AA 88 47 02 mov byte ptr [edi+2],al
0012B4AD 83 EE 02 sub esi,2
0012B4B0 83 EF 02 sub edi,2
0012B4B3 83 F9 08 cmp ecx,8
0012B4B6 72 88 jb TrailUpVec+78h (012B440h)
0012B4B8 FD std
0012B4B9 F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B4BB FC cld
0012B4BC FF 24 95 64 B5 12 00 jmp dword ptr [edx*4+12B564h]
0012B4C3 90 nop
LeadDown3:
0012B4C4 8A 46 03 mov al,byte ptr [esi+3]
0012B4C7 23 D1 and edx,ecx
0012B4C9 88 47 03 mov byte ptr [edi+3],al
0012B4CC 8A 46 02 mov al,byte ptr [esi+2]
0012B4CF 88 47 02 mov byte ptr [edi+2],al
0012B4D2 8A 46 01 mov al,byte ptr [esi+1]
0012B4D5 C1 E9 02 shr ecx,2
0012B4D8 88 47 01 mov byte ptr [edi+1],al
0012B4DB 83 EE 03 sub esi,3
0012B4DE 83 EF 03 sub edi,3
0012B4E1 83 F9 08 cmp ecx,8
0012B4E4 0F 82 56 FF FF FF jb TrailUpVec+78h (012B440h)
0012B4EA FD std
0012B4EB F3 A5 rep movs dword ptr es:[edi],dword ptr [esi]
0012B4ED FC cld
0012B4EE FF 24 95 64 B5 12 00 jmp dword ptr [edx*4+12B564h]
0012B4F5 8D 49 00 lea ecx,[ecx]
0012B4F8 18 B5 12 00 20 B5 sbb byte ptr [ebp-4ADFFFEEh],dh
0012B4FE 12 00 adc al,byte ptr [eax]
0012B500 28 B5 12 00 30 B5 sub byte ptr [ebp-4ACFFFEEh],dh
0012B506 12 00 adc al,byte ptr [eax]
0012B508 38 B5 12 00 40 B5 cmp byte ptr [ebp-4ABFFFEEh],dh
0012B50E 12 00 adc al,byte ptr [eax]
0012B510 48 dec eax
0012B511 B5 12 mov ch,12h
0012B513 00 5B B5 add byte ptr [ebx-4Bh],bl
0012B516 12 00 adc al,byte ptr [eax]
UnwindDown7:
0012B518 8B 44 8E 1C mov eax,dword ptr [esi+ecx*4+1Ch]
0012B51C 89 44 8F 1C mov dword ptr [edi+ecx*4+1Ch],eax
UnwindDown6:
0012B520 8B 44 8E 18 mov eax,dword ptr [esi+ecx*4+18h]
0012B524 89 44 8F 18 mov dword ptr [edi+ecx*4+18h],eax
UnwindDown5:
0012B528 8B 44 8E 14 mov eax,dword ptr [esi+ecx*4+14h]
0012B52C 89 44 8F 14 mov dword ptr [edi+ecx*4+14h],eax
UnwindDown4:
0012B530 8B 44 8E 10 mov eax,dword ptr [esi+ecx*4+10h]
0012B534 89 44 8F 10 mov dword ptr [edi+ecx*4+10h],eax
UnwindDown3:
0012B538 8B 44 8E 0C mov eax,dword ptr [esi+ecx*4+0Ch]
0012B53C 89 44 8F 0C mov dword ptr [edi+ecx*4+0Ch],eax
UnwindDown2:
0012B540 8B 44 8E 08 mov eax,dword ptr [esi+ecx*4+8]
0012B544 89 44 8F 08 mov dword ptr [edi+ecx*4+8],eax
UnwindDown1:
0012B548 8B 44 8E 04 mov eax,dword ptr [esi+ecx*4+4]
0012B54C 89 44 8F 04 mov dword ptr [edi+ecx*4+4],eax
0012B550 8D 04 8D 00 00 00 00 lea eax,[ecx*4]
0012B557 03 F0 add esi,eax
0012B559 03 F8 add edi,eax
UnwindDown0:
0012B55B FF 24 95 64 B5 12 00 jmp dword ptr [edx*4+12B564h]
0012B562 8B FF mov edi,edi
0012B564 74 B5 je UnwindDownVec+23h (012B51Bh)
0012B566 12 00 adc al,byte ptr [eax]
0012B568 7C B5 jl UnwindDownVec+27h (012B51Fh)
0012B56A 12 00 adc al,byte ptr [eax]
0012B56C 8C B5 12 00 A0 B5 mov word ptr [ebp-4A5FFFEEh],st(-2)
0012B572 12 00 adc al,byte ptr [eax]
TrailDown0:
0012B574 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B578 5E pop esi
0012B579 5F pop edi
0012B57A C3 ret
0012B57B 90 nop
TrailDown1:
0012B57C 8A 46 03 mov al,byte ptr [esi+3]
0012B57F 88 47 03 mov byte ptr [edi+3],al
0012B582 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B586 5E pop esi
0012B587 5F pop edi
0012B588 C3 ret
0012B589 8D 49 00 lea ecx,[ecx]
TrailDown2:
0012B58C 8A 46 03 mov al,byte ptr [esi+3]
0012B58F 88 47 03 mov byte ptr [edi+3],al
0012B592 8A 46 02 mov al,byte ptr [esi+2]
0012B595 88 47 02 mov byte ptr [edi+2],al
0012B598 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B59C 5E pop esi
0012B59D 5F pop edi
0012B59E C3 ret
0012B59F 90 nop
TrailDown3:
0012B5A0 8A 46 03 mov al,byte ptr [esi+3]
0012B5A3 88 47 03 mov byte ptr [edi+3],al
0012B5A6 8A 46 02 mov al,byte ptr [esi+2]
0012B5A9 88 47 02 mov byte ptr [edi+2],al
0012B5AC 8A 46 01 mov al,byte ptr [esi+1]
0012B5AF 88 47 01 mov byte ptr [edi+1],al
0012B5B2 8B 44 24 0C mov eax,dword ptr [esp+0Ch]
0012B5B6 5E pop esi
0012B5B7 5F pop edi
0012B5B8 C3 ret
0012B5B9 8D A4 24 00 00 00 00 lea esp,[esp]
VEC_memcpy:
0012B5C0 57 push edi
0012B5C1 8B C6 mov eax,esi
0012B5C3 83 E0 0F and eax,0Fh
0012B5C6 85 C0 test eax,eax
0012B5C8 0F 85 D2 00 00 00 jne TrailDownVec+13Ch (012B6A0h)
L_Aligned:
0012B5CE 8B D1 mov edx,ecx
0012B5D0 83 E1 7F and ecx,7Fh
0012B5D3 C1 EA 07 shr edx,7
0012B5D6 74 65 je TrailDownVec+0D9h (012B63Dh)
0012B5D8 8D A4 24 00 00 00 00 lea esp,[esp]
0012B5DF 90 nop
L_1:
0012B5E0 66 0F 6F 06 movdqa xmm0,xmmword ptr [esi]
0012B5E4 66 0F 6F 4E 10 movdqa xmm1,xmmword ptr [esi+10h]
0012B5E9 66 0F 6F 56 20 movdqa xmm2,xmmword ptr [esi+20h]
0012B5EE 66 0F 6F 5E 30 movdqa xmm3,xmmword ptr [esi+30h]
0012B5F3 66 0F 7F 07 movdqa xmmword ptr [edi],xmm0
0012B5F7 66 0F 7F 4F 10 movdqa xmmword ptr [edi+10h],xmm1
0012B5FC 66 0F 7F 57 20 movdqa xmmword ptr [edi+20h],xmm2
0012B601 66 0F 7F 5F 30 movdqa xmmword ptr [edi+30h],xmm3
0012B606 66 0F 6F 66 40 movdqa xmm4,xmmword ptr [esi+40h]
0012B60B 66 0F 6F 6E 50 movdqa xmm5,xmmword ptr [esi+50h]
0012B610 66 0F 6F 76 60 movdqa xmm6,xmmword ptr [esi+60h]
0012B615 66 0F 6F 7E 70 movdqa xmm7,xmmword ptr [esi+70h]
0012B61A 66 0F 7F 67 40 movdqa xmmword ptr [edi+40h],xmm4
0012B61F 66 0F 7F 6F 50 movdqa xmmword ptr [edi+50h],xmm5
0012B624 66 0F 7F 77 60 movdqa xmmword ptr [edi+60h],xmm6
0012B629 66 0F 7F 7F 70 movdqa xmmword ptr [edi+70h],xmm7
0012B62E 8D B6 80 00 00 00 lea esi,[esi+80h]
0012B634 8D BF 80 00 00 00 lea edi,[edi+80h]
0012B63A 4A dec edx
0012B63B 75 A3 jne TrailDownVec+7Ch (012B5E0h)
L_1a:
0012B63D 85 C9 test ecx,ecx
0012B63F 74 4F je TrailDownVec+12Ch (012B690h)
0012B641 8B D1 mov edx,ecx
0012B643 C1 EA 04 shr edx,4
0012B646 85 D2 test edx,edx
0012B648 74 17 je TrailDownVec+0FDh (012B661h)
0012B64A 8D 9B 00 00 00 00 lea ebx,[ebx]
L_2:
0012B650 66 0F 6F 06 movdqa xmm0,xmmword ptr [esi]
0012B654 66 0F 7F 07 movdqa xmmword ptr [edi],xmm0
0012B658 8D 76 10 lea esi,[esi+10h]
0012B65B 8D 7F 10 lea edi,[edi+10h]
0012B65E 4A dec edx
0012B65F 75 EF jne TrailDownVec+0ECh (012B650h)
L_Trailing:
0012B661 83 E1 0F and ecx,0Fh
0012B664 74 2A je TrailDownVec+12Ch (012B690h)
0012B666 8B C1 mov eax,ecx
0012B668 C1 E9 02 shr ecx,2
0012B66B 74 0D je TrailDownVec+116h (012B67Ah)
L_TrailDword:
0012B66D 8B 16 mov edx,dword ptr [esi]
0012B66F 89 17 mov dword ptr [edi],edx
0012B671 8D 76 04 lea esi,[esi+4]
0012B674 8D 7F 04 lea edi,[edi+4]
0012B677 49 dec ecx
0012B678 75 F3 jne TrailDownVec+109h (012B66Dh)
L_TrailBytes:
0012B67A 8B C8 mov ecx,eax
0012B67C 83 E1 03 and ecx,3
0012B67F 74 0F je TrailDownVec+12Ch (012B690h)
L_TrailNextByte:
0012B681 8A 06 mov al,byte ptr [esi]
0012B683 88 07 mov byte ptr [edi],al
0012B685 46 inc esi
0012B686 47 inc edi
0012B687 49 dec ecx
0012B688 75 F7 jne TrailDownVec+11Dh (012B681h)
0012B68A 8D 9B 00 00 00 00 lea ebx,[ebx]
L_Return:
0012B690 58 pop eax
0012B691 5E pop esi
0012B692 5F pop edi
0012B693 C3 ret
0012B694 8D A4 24 00 00 00 00 lea esp,[esp]
0012B69B EB 03 jmp TrailDownVec+13Ch (012B6A0h)
0012B69D CC int 3
0012B69E CC int 3
0012B69F CC int 3
L_Notaligned:
0012B6A0 BA 10 00 00 00 mov edx,10h
0012B6A5 2B D0 sub edx,eax
0012B6A7 2B CA sub ecx,edx
0012B6A9 51 push ecx
0012B6AA 8B C2 mov eax,edx
0012B6AC 8B C8 mov ecx,eax
0012B6AE 83 E1 03 and ecx,3
0012B6B1 74 09 je TrailDownVec+158h (012B6BCh)
L_Byte:
0012B6B3 8A 16 mov dl,byte ptr [esi]
0012B6B5 88 17 mov byte ptr [edi],dl
0012B6B7 46 inc esi
0012B6B8 47 inc edi
0012B6B9 49 dec ecx
0012B6BA 75 F7 jne TrailDownVec+14Fh (012B6B3h)
L_MovDword:
0012B6BC C1 E8 02 shr eax,2
0012B6BF 74 0D je TrailDownVec+16Ah (012B6CEh)
L_Dword:
0012B6C1 8B 16 mov edx,dword ptr [esi]
0012B6C3 89 17 mov dword ptr [edi],edx
0012B6C5 8D 76 04 lea esi,[esi+4]
0012B6C8 8D 7F 04 lea edi,[edi+4]
0012B6CB 48 dec eax
0012B6CC 75 F3 jne TrailDownVec+15Dh (012B6C1h)
L_Adjustcnt:
0012B6CE 59 pop ecx
0012B6CF E9 FA FE FF FF jmp TrailDownVec+6Ah (012B5CEh)
So a single memcpy runs to 700+ lines, far longer than you might imagine. Indeed, it is not simply one rep movsb that copies the memory. Why does it go to all this trouble? Let me break it down.
Skipping the boilerplate at the top for now, the source contains these comment lines:
;
; Check for overlapping buffers:
; If (dst <= src) Or (dst >= src + Count) Then
; Do normal (Upwards) Copy
; Else
; Do Downwards Copy to avoid propagation
;
In plain language: check whether the buffers overlap. If the destination is at or before the source, or the destination starts past the end of the source (src + Count), do the normal copy toward higher addresses (the "upwards" copy). Otherwise, copy toward lower addresses, starting from the end, to avoid propagation, that is, to avoid overwriting source bytes before they have been read and producing repeated fragments.
Although the C standard only guarantees correct behavior for overlapping buffers with memmove, and leaves memcpy undefined in that case, VS2012 still handles it conservatively. Weighed against the small performance cost (and the rest of the routine is heavily optimized anyway, as we will see shortly), one less pitfall is one less pitfall.
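In C, the dispatch that this comment describes is roughly the following (a minimal sketch with names of my own choosing, not the CRT's code):

#include <stddef.h>

/* Sketch of the overlap check described above. Comparing unrelated pointers
   is technically not portable C, but it mirrors what the assembly does. */
void copy_dispatch(unsigned char *dst, const unsigned char *src, size_t count)
{
    size_t i;
    if (dst <= src || dst >= src + count) {
        /* No harmful overlap: copy "upwards", toward higher addresses. */
        for (i = 0; i < count; i++)
            dst[i] = src[i];
    } else {
        /* dst overlaps the tail of src: copy "downwards", from the end,
           so every source byte is read before it can be overwritten. */
        for (i = count; i-- > 0; )
            dst[i] = src[i];
    }
}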
Now start reading from CopyUp: this is the implementation of the normal copy toward higher addresses.
First it checks whether "Enhanced Fast Strings" support is available on the CPU; if it is, a plain rep movsb does the whole copy:
; See if Enhanced Fast Strings is supported.
; ENFSTRG supported?
bt __favor, __FAVOR_ENFSTRG
jnc CopyUpSSE2Check ; no jump
;
; use Enhanced Fast Strings
rep movsb
jmp TrailUp0 ; Done
In other words, if the CPU lacks this feature, there are still other optimizations to fall back on. As the jnc CopyUpSSE2Check instruction shows, it next tests whether the CPU supports the SSE2 instruction set and, if so, uses SSE2 instructions to speed up the copy.
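On MSVC, that Enhanced Fast Strings path can be expressed with the __movsb intrinsic, which emits rep movsb. This is only a sketch: the feature test is reduced to a placeholder flag standing in for the CRT's __favor / __FAVOR_ENFSTRG bit test.

#include <intrin.h>     /* __movsb emits rep movsb on x86/x64 with MSVC */
#include <stddef.h>

/* Placeholder for the CRT's __favor/__FAVOR_ENFSTRG check (hypothetical). */
extern int cpu_has_enhanced_fast_strings;

void copy_up(unsigned char *dst, const unsigned char *src, size_t count)
{
    if (cpu_has_enhanced_fast_strings) {
        __movsb(dst, src, count);   /* one rep movsb moves every byte */
        return;
    }
    /* otherwise fall through to the SSE2 / dword paths described below */
}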
CopyUpSSE2Check:
;
; Next, see if we can use a "fast" copy SSE2 routine
; block size greater than min threshold?
cmp ecx,080h
jb Dword_align ; length too small go use dwords
; alignments equal?
mov eax,edi
xor eax,esi
test eax,15
jne AtomChk ; Not aligned go check Atom
bt __isa_enabled, __ISA_AVAILABLE_SSE2
jc VEC_memcpy ; yes, go SSE2 copy (params already set)
It first checks whether the amount to copy reaches 128 bytes (080h). If not, SSE2 acceleration would not gain much, so it jumps to the path that copies with REP MOVSD.
Then it checks whether the source and destination share the same offset within a 16-byte block (it XORs the two addresses and tests the low four bits). If the offsets differ, it goes on to check whether the CPU is an Atom; if they match, it checks whether SSE2 is enabled and, if so, jumps straight to the SSE2 routine (VEC_memcpy), with the parameters already in place.
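As a rough C rendering of that dispatch (a sketch; the helper names and the cpu_has_sse2 flag are placeholders of mine, not CRT symbols):

#include <stdint.h>
#include <stddef.h>

/* Hypothetical helpers standing in for the labels in the listing. */
void dword_align_copy(unsigned char *dst, const unsigned char *src, size_t n);
void atom_check(unsigned char *dst, const unsigned char *src, size_t n);
void vec_memcpy(unsigned char *dst, const unsigned char *src, size_t n);
extern int cpu_has_sse2;   /* stands in for __isa_enabled / __ISA_AVAILABLE_SSE2 */

void copy_up_dispatch(unsigned char *dst, const unsigned char *src, size_t n)
{
    if (n < 0x80) {
        dword_align_copy(dst, src, n);          /* too small for SSE2 to pay off */
    } else if ((((uintptr_t)dst ^ (uintptr_t)src) & 15) == 0 && cpu_has_sse2) {
        vec_memcpy(dst, src, n);                /* equal 16-byte offsets: SSE2 copy */
    } else {
        atom_check(dst, src, n);                /* unequal offsets or no SSE2 */
    }
}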
AtomChk:
; Is Atom supported?
bt __favor, __FAVOR_ATOM
jnc Dword_align ; no,jump
; check if dst is 4 byte aligned
test edi, 3
jne CopyLeadUp
; check if src is 4 byte aligned
test esi, 3
jne Dword_align_Ok
If it is not an Atom processor, it jumps straight to the REP MOVSD path (Dword_align).
If it is, it checks whether the destination is 4-byte aligned; if not, it jumps to CopyLeadUp (the comment at CopyLeadUp's entry says it handles copies to non-dword-aligned destinations).
It then checks whether the source is 4-byte aligned; if not, it jumps to Dword_align_Ok, the point in the REP MOVSD path just past the destination-alignment check, and continues from there.
; A software pipelining vectorized memcpy loop using PALIGN instructions
; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
; 4 byte align -> 12 byte copy, 8 byte align -> 8 byte copy, 12 byte align -> 4 byte copy
PalignHead4:
bt edi, 2
jae PalignHead8
mov eax, dword ptr [esi]
sub ecx, 4
lea esi, byte ptr [esi+4]
mov dword ptr [edi], eax
lea edi, byte ptr [edi+4]
This part is: a software-pipelined, vectorized memcpy loop using PALIGNR instructions.
(1) Copy the first few bytes so that the destination is aligned up to the nearest 16-byte boundary:
a 4-byte-aligned destination needs a 12-byte head copy, an 8-byte-aligned one needs 8 bytes, and a 12-byte-aligned one needs 4 bytes.
PalignHead4 tests bit 2 of the destination address; if it is set, one dword mov copies 4 bytes, which brings the destination to 8-byte alignment.
PalignHead8:
bt edi, 3
jae PalignLoop
movq xmm1, qword ptr [esi]
sub ecx, 8
lea esi, byte ptr [esi+8]
movq qword ptr [edi], xmm1
lea edi, byte ptr [edi+8]
PalignHead8 then tests bit 3 of the destination; if it is clear, the destination is already 16-byte aligned and we enter the copy loop directly.
If it is set, a single movq performs an 8-byte copy and completes the 16-byte alignment.
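In C, those two head steps amount to roughly this (a sketch; as in the assembly, it assumes both pointers are already 4-byte aligned and that at least 16 bytes remain):

#include <stdint.h>
#include <string.h>

/* Bring dst up to a 16-byte boundary; the small memcpy calls typically
   compile down to the single mov / movq the assembly uses. */
static void palign_head(unsigned char **dst, const unsigned char **src, size_t *count)
{
    if ((uintptr_t)*dst & 4) {               /* bit 2 set */
        memcpy(*dst, *src, 4);               /* one dword mov */
        *dst += 4; *src += 4; *count -= 4;   /* dst is now 8-byte aligned */
    }
    if ((uintptr_t)*dst & 8) {               /* bit 3 set */
        memcpy(*dst, *src, 8);               /* one movq */
        *dst += 8; *src += 8; *count -= 8;   /* dst is now 16-byte aligned */
    }
}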
;(2) Use SSE palign loop
PalignLoop:
test esi, 7
je MovPalign8
bt esi, 3
jae MovPalign4
Here it looks at the source's alignment: if the source is 8-byte aligned, the 8-byte palignr variant is used; if the source sits 4 bytes past a 16-byte boundary, the 4-byte variant; otherwise (12 bytes past) the 12-byte variant. The number is the byte shift passed to palignr.
PALIGN_memcpy 12
0012B148 66 0F 6F 4E F4 movdqa xmm1,xmmword ptr [esi-0Ch]
0012B14D 8D 76 F4 lea esi,[esi-0Ch]
PalignLoop12:
0012B150 66 0F 6F 5E 10 movdqa xmm3,xmmword ptr [esi+10h]
0012B155 83 E9 30 sub ecx,30h
0012B158 66 0F 6F 46 20 movdqa xmm0,xmmword ptr [esi+20h]
0012B15D 66 0F 6F 6E 30 movdqa xmm5,xmmword ptr [esi+30h]
0012B162 8D 76 30 lea esi,[esi+30h]
0012B165 83 F9 30 cmp ecx,30h
0012B168 66 0F 6F D3 movdqa xmm2,xmm3
0012B16C 66 0F 3A 0F D9 0C palignr xmm3,xmm1,0Ch
0012B172 66 0F 7F 1F movdqa xmmword ptr [edi],xmm3
0012B176 66 0F 6F E0 movdqa xmm4,xmm0
0012B17A 66 0F 3A 0F C2 0C palignr xmm0,xmm2,0Ch
0012B180 66 0F 7F 47 10 movdqa xmmword ptr [edi+10h],xmm0
0012B185 66 0F 6F CD movdqa xmm1,xmm5
0012B189 66 0F 3A 0F EC 0C palignr xmm5,xmm4,0Ch
0012B18F 66 0F 7F 6F 20 movdqa xmmword ptr [edi+20h],xmm5
0012B194 8D 7F 30 lea edi,[edi+30h]
0012B197 7D B7 jge CopyUp+0A0h (012B150h)
0012B199 8D 76 0C lea esi,[esi+0Ch]
jmp PalignTail
0012B19C E9 AF 00 00 00 jmp CopyUp+1A0h (012B250h)
The block above has no matching source lines for the actual copy; presumably a MASM macro (PALIGN_memcpy) generates it. This is the 12-byte (PALIGN_memcpy 12) variant.
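With intrinsics, the idea behind one of these loops looks roughly like the sketch below. It uses _mm_alignr_epi8, the intrinsic for palignr (strictly an SSSE3 instruction), is simplified to one 16-byte block per iteration instead of the three-block software pipeline, and assumes dst is 16-byte aligned while src sits 12 bytes past a 16-byte boundary, matching the shift-12 case. Note that, like the assembly, it reads a few bytes outside [src, src+n) from the surrounding aligned 16-byte blocks.

#include <emmintrin.h>   /* SSE2: _mm_load_si128 / _mm_store_si128 */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */
#include <stddef.h>

/* Sketch of the shift-12 palignr copy: dst % 16 == 0, src % 16 == 12. */
static void palign12_copy(unsigned char *dst, const unsigned char *src, size_t n)
{
    /* Like movdqa xmm1,[esi-0Ch]: start from the aligned block holding src-12. */
    const __m128i *s = (const __m128i *)(src - 12);
    __m128i prev = _mm_load_si128(s);              /* bytes src[-12..3]    */
    size_t i;
    for (i = 0; i + 16 <= n; i += 16) {
        __m128i next = _mm_load_si128(++s);        /* bytes src[i+4..i+19] */
        /* Bytes 12..27 of the pair prev:next are exactly src[i..i+15].    */
        __m128i out = _mm_alignr_epi8(next, prev, 12);
        _mm_store_si128((__m128i *)(dst + i), out);
        prev = next;
    }
    /* the remaining n - i bytes are handled by the tail code */
}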
PALIGN_memcpy 8
0012B1A1 66 0F 6F 4E F8 movdqa xmm1,xmmword ptr [esi-8]
0012B1A6 8D 76 F8 lea esi,[esi-8]
0012B1A9 8D 49 00 lea ecx,[ecx]
PalignLoop8:
0012B1AC 66 0F 6F 5E 10 movdqa xmm3,xmmword ptr [esi+10h]
0012B1B1 83 E9 30 sub ecx,30h
0012B1B4 66 0F 6F 46 20 movdqa xmm0,xmmword ptr [esi+20h]
0012B1B9 66 0F 6F 6E 30 movdqa xmm5,xmmword ptr [esi+30h]
0012B1BE 8D 76 30 lea esi,[esi+30h]
0012B1C1 83 F9 30 cmp ecx,30h
0012B1C4 66 0F 6F D3 movdqa xmm2,xmm3
0012B1C8 66 0F 3A 0F D9 08 palignr xmm3,xmm1,8
0012B1CE 66 0F 7F 1F movdqa xmmword ptr [edi],xmm3
0012B1D2 66 0F 6F E0 movdqa xmm4,xmm0
0012B1D6 66 0F 3A 0F C2 08 palignr xmm0,xmm2,8
0012B1DC 66 0F 7F 47 10 movdqa xmmword ptr [edi+10h],xmm0
0012B1E1 66 0F 6F CD movdqa xmm1,xmm5
0012B1E5 66 0F 3A 0F EC 08 palignr xmm5,xmm4,8
0012B1EB 66 0F 7F 6F 20 movdqa xmmword ptr [edi+20h],xmm5
0012B1F0 8D 7F 30 lea edi,[edi+30h]
0012B1F3 7D B7 jge CopyUp+0FCh (012B1ACh)
0012B1F5 8D 76 08 lea esi,[esi+8]
jmp PalignTail
0012B1F8 EB 56 jmp CopyUp+1A0h (012B250h)
The block above is the 8-byte (PALIGN_memcpy 8) variant.
PALIGN_memcpy 4
0012B1FA 66 0F 6F 4E FC movdqa xmm1,xmmword ptr [esi-4]
0012B1FF 8D 76 FC lea esi,[esi-4]
0012B202 8B FF mov edi,edi
PalignLoop4:
0012B204 66 0F 6F 5E 10 movdqa xmm3,xmmword ptr [esi+10h]
0012B209 83 E9 30 sub ecx,30h
0012B20C 66 0F 6F 46 20 movdqa xmm0,xmmword ptr [esi+20h]
0012B211 66 0F 6F 6E 30 movdqa xmm5,xmmword ptr [esi+30h]
0012B216 8D 76 30 lea esi,[esi+30h]
0012B219 83 F9 30 cmp ecx,30h
0012B21C 66 0F 6F D3 movdqa xmm2,xmm3
0012B220 66 0F 3A 0F D9 04 palignr xmm3,xmm1,4
0012B226 66 0F 7F 1F movdqa xmmword ptr [edi],xmm3
0012B22A 66 0F 6F E0 movdqa xmm4,xmm0
0012B22E 66 0F 3A 0F C2 04 palignr xmm0,xmm2,4
0012B234 66 0F 7F 47 10 movdqa xmmword ptr [edi+10h],xmm0
0012B239 66 0F 6F CD movdqa xmm1,xmm5
0012B23D 66 0F 3A 0F EC 04 palignr xmm5,xmm4,4
0012B243 66 0F 7F 6F 20 movdqa xmmword ptr [edi+20h],xmm5
0012B248 8D 7F 30 lea edi,[edi+30h]
0012B24B 7D B7 jge CopyUp+154h (012B204h)
0012B24D 8D 76 04 lea esi,[esi+4]
The block above is the 4-byte (PALIGN_memcpy 4) variant.
;(3) Copy the tailing bytes.
PalignTail:
cmp ecx,10h
jl PalignTail4
movdqu xmm1,xmmword ptr [esi]
sub ecx, 10h
lea esi, xmmword ptr [esi+10h]
movdqa xmmword ptr [edi],xmm1
lea edi, xmmword ptr [edi+10h]
jmp PalignTail
The code above copies the trailing bytes: while at least 16 bytes remain, it moves 16 at a time with an unaligned load (movdqu) and an aligned store (movdqa).
PalignTail4:
bt ecx, 2
jae PalignTail8
mov eax, dword ptr [esi]
sub ecx,4
lea esi, byte ptr [esi+4]
mov dword ptr [edi], eax
lea edi, byte ptr [edi+4]
PalignTail8:
bt ecx, 3
jae PalignTailLE3
movq xmm1, qword ptr [esi]
sub ecx,8
lea esi, byte ptr [esi+8]
movq qword ptr [edi], xmm1
lea edi, byte ptr [edi+8]
PalignTailLE3:
mov eax, dword ptr TrailUpVec[ecx*4]
jmp eax
The code above finishes off whatever remains: a 4-byte chunk if bit 2 of the remaining count is set, an 8-byte chunk if bit 3 is set, and finally an indirect jump through the TrailUpVec table for the last 0 to 3 bytes.
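A C rendering of this tail handling could look like the sketch below (the TrailUpVec jump table becomes a switch on the last 0 to 3 bytes; the helper name is mine):

#include <string.h>
#include <stddef.h>

/* Sketch of PalignTail: finish whatever the palignr loop left behind. */
static void palign_tail(unsigned char *dst, const unsigned char *src, size_t n)
{
    while (n >= 16) {                   /* movdqu load + movdqa store in the asm */
        memcpy(dst, src, 16);
        dst += 16; src += 16; n -= 16;
    }
    if (n & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }   /* PalignTail4 */
    if (n & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }   /* PalignTail8 */
    switch (n & 3) {                    /* TrailUpVec: last 0..3 bytes */
    case 3: dst[2] = src[2];            /* fall through */
    case 2: dst[1] = src[1];            /* fall through */
    case 1: dst[0] = src[0];            /* fall through */
    case 0: break;
    }
}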
; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination. This
; occurs in 3 steps.
;
; - move x = ((4 - Dest & 3) & 3) bytes
; - move y = ((L-x) >> 2) dwords
; - move (L - x - y*4) bytes
;
Dword_align:
test edi,11b ;U - destination dword aligned?
jnz short CopyLeadUp ;V - if we are not dword aligned already, align
Dword_align_Ok:
shr ecx,2 ;U - shift down to dword count
and edx,11b ;V - trailing byte count
cmp ecx,8 ;U - test if small enough for unwind copy
jb short CopyUnwindUp ;V - if so, then jump
rep movsd ;N - move all of our dwords
jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes
This is the REP MOVSD path mentioned earlier. But even for REP MOVSD, the destination's alignment has to be checked first; only when it is aligned is this kind of copy appropriate, otherwise it loses speed.
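The three steps from the comment block map to roughly this C (a sketch using MSVC's __movsd intrinsic, which emits rep movsd; the function name is mine):

#include <intrin.h>    /* __movsd emits rep movsd on x86/x64 with MSVC */
#include <stdint.h>
#include <stddef.h>

/* Sketch of the forward dword copy: align dst, bulk-move dwords, finish bytes. */
void dword_align_copy(unsigned char *dst, const unsigned char *src, size_t len)
{
    size_t x = (4 - ((uintptr_t)dst & 3)) & 3;    /* leading bytes to align dst  */
    size_t i, y;
    if (x > len)
        x = len;
    for (i = 0; i < x; i++)                       /* step 1: move x bytes        */
        dst[i] = src[i];
    y = (len - x) >> 2;                           /* step 2: move y dwords       */
    __movsd((unsigned long *)(dst + x), (const unsigned long *)(src + x), y);
    for (i = x + y * 4; i < len; i++)             /* step 3: trailing bytes      */
        dst[i] = src[i];
}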
;
; Code to do optimal memory copies for non-dword-aligned destinations.
;
; The following length check is done for two reasons:
;
; 1. to ensure that the actual move length is greater than any possiale
; alignment move, and
;
; 2. to skip the multiple move logic for small moves where it would
; be faster to move the bytes with one instruction.
;
align @WordSize
CopyLeadUp:
mov eax,edi ;U - get destination offset
mov edx,11b ;V - prepare for mask
sub ecx,4 ;U - check for really short string - sub for adjust
jb short ByteCopyUp ;V - branch to just copy bytes
and eax,11b ;U - get offset within first dword
add ecx,eax ;V - update size after leading bytes copied
jmp dword ptr LeadUpVec[eax*4-4] ;N - process leading bytes
align @WordSize
ByteCopyUp:
jmp dword ptr TrailUpVec[ecx*4+16] ;N - process just bytes
align @WordSize
CopyUnwindUp:
jmp dword ptr UnwindUpVec[ecx*4] ;N - unwind dword copy
This code optimizes copies whose destination address is not 4-byte aligned. The length check that follows exists for two reasons:
1. to ensure that the actual move length is greater than any possible alignment move, and
2. to skip the multi-step move logic for small moves, where it is faster to move the bytes with a single instruction.
The rest of the listing has no source file, so I will not walk through it, but one fragment is worth calling out here:
L_1:
0012B5E0 66 0F 6F 06 movdqa xmm0,xmmword ptr [esi]
0012B5E4 66 0F 6F 4E 10 movdqa xmm1,xmmword ptr [esi+10h]
0012B5E9 66 0F 6F 56 20 movdqa xmm2,xmmword ptr [esi+20h]
0012B5EE 66 0F 6F 5E 30 movdqa xmm3,xmmword ptr [esi+30h]
0012B5F3 66 0F 7F 07 movdqa xmmword ptr [edi],xmm0
0012B5F7 66 0F 7F 4F 10 movdqa xmmword ptr [edi+10h],xmm1
0012B5FC 66 0F 7F 57 20 movdqa xmmword ptr [edi+20h],xmm2
0012B601 66 0F 7F 5F 30 movdqa xmmword ptr [edi+30h],xmm3
0012B606 66 0F 6F 66 40 movdqa xmm4,xmmword ptr [esi+40h]
0012B60B 66 0F 6F 6E 50 movdqa xmm5,xmmword ptr [esi+50h]
0012B610 66 0F 6F 76 60 movdqa xmm6,xmmword ptr [esi+60h]
0012B615 66 0F 6F 7E 70 movdqa xmm7,xmmword ptr [esi+70h]
0012B61A 66 0F 7F 67 40 movdqa xmmword ptr [edi+40h],xmm4
0012B61F 66 0F 7F 6F 50 movdqa xmmword ptr [edi+50h],xmm5
0012B624 66 0F 7F 77 60 movdqa xmmword ptr [edi+60h],xmm6
0012B629 66 0F 7F 7F 70 movdqa xmmword ptr [edi+70h],xmm7
This uses all eight SSE2 (XMM) registers for the copy, each holding 16 bytes. Copying this way, the movdqa loads and stores can overlap and effectively execute in parallel, especially when the memory is multi-channel, so for large copies the throughput should improve considerably.
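With intrinsics, one iteration of that unrolled loop corresponds to roughly the sketch below (SSE2 only; it assumes both pointers are 16-byte aligned, which the code before L_1 guarantees, and it copies only the full 128-byte blocks, leaving the remainder to the tail code):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>

/* Sketch of the L_1 loop: copy full 128-byte blocks with aligned 16-byte moves. */
static size_t sse2_copy_blocks(unsigned char *dst, const unsigned char *src, size_t n)
{
    size_t done = 0;
    for (; done + 128 <= n; done += 128) {
        __m128i x0 = _mm_load_si128((const __m128i *)(src + done));
        __m128i x1 = _mm_load_si128((const __m128i *)(src + done + 16));
        __m128i x2 = _mm_load_si128((const __m128i *)(src + done + 32));
        __m128i x3 = _mm_load_si128((const __m128i *)(src + done + 48));
        _mm_store_si128((__m128i *)(dst + done),      x0);
        _mm_store_si128((__m128i *)(dst + done + 16), x1);
        _mm_store_si128((__m128i *)(dst + done + 32), x2);
        _mm_store_si128((__m128i *)(dst + done + 48), x3);
        __m128i x4 = _mm_load_si128((const __m128i *)(src + done + 64));
        __m128i x5 = _mm_load_si128((const __m128i *)(src + done + 80));
        __m128i x6 = _mm_load_si128((const __m128i *)(src + done + 96));
        __m128i x7 = _mm_load_si128((const __m128i *)(src + done + 112));
        _mm_store_si128((__m128i *)(dst + done + 64),  x4);
        _mm_store_si128((__m128i *)(dst + done + 80),  x5);
        _mm_store_si128((__m128i *)(dst + done + 96),  x6);
        _mm_store_si128((__m128i *)(dst + done + 112), x7);
    }
    return done;   /* the remaining n - done bytes are handled afterwards */
}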
VS2012's memcpy has an optimized path for every combination of data size and alignment, which shows in every respect how hard it is to replace. A wheel you build yourself is not always better than the one that already exists.