/* Copyright 2002 Andi Kleen */

/*
 * NOTE(review): the header names were missing from the source as received
 * (bare "#include" tokens).  They are restored here from the macros this
 * file uses (ENTRY/ENDPROC/ALIGN, CFI_*, X86_FEATURE_REP_GOOD) - confirm
 * against the tree this file belongs to.
 */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>

/*
 * memcpy - Copy a memory block.
 *
 * Syntax: GNU as, AT&T operand order (source, destination).
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count (all 64 bits are honoured)
 *
 * Output:
 *  rax original destination
 *
 * Clobbers: rcx, rdx (memcpy_c only), rsi, rdi, r8-r11 (memcpy only), flags.
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax			/* return value: original destination */

	/*
	 * rcx = number of whole quadwords.  The count is taken from the
	 * full 64-bit %rdx so that copies of 4 GiB and more are not
	 * silently truncated to (count mod 2^32).
	 */
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx			/* edx = 0..7 trailing bytes */
	rep movsq

	movl %edx, %ecx			/* zero-extends: rcx = trailing bytes */
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %rcx.
	 * Tail portion is handled at the end.
	 *
	 * The first instruction must remain at least as long as the
	 * replacement jump emitted in .altinstr_replacement below, since
	 * that jump is patched over the start of this function.
	 */
	movq %rdi, %rax			/* return value: original destination */

	/* 64-bit count: do not truncate copies of 4 GiB and more. */
	movq %rdx, %rcx
	shrq $6, %rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions in between do
	 * not change the zero flag):
	 */
	decq %rcx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	/* leaq advances the pointers without touching ZF from decq above. */
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	/*
	 * ecx = number of whole quadwords left in the final (count & 63)
	 * bytes.  32-bit arithmetic is sufficient: the value is masked
	 * to at most 63 before use.
	 */
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz .Lloop_8			/* ZF still from decl: movq/leaq keep flags */

.Lhandle_7:
	/* ecx = final 0..7 bytes, copied one at a time. */
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	/*
	 * Replacement code: a two-byte short jump from the start of memcpy
	 * to memcpy_c.  The displacement is a single signed byte measured
	 * from the end of the jump, so memcpy_c has to stay within
	 * short-jump range of memcpy.
	 */
	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous

	/*
	 * Patch record.  NOTE(review): the field meanings noted below are
	 * presumed from the layout (address to patch, replacement address,
	 * CPU feature bit, two lengths) - confirm against the alternatives
	 * record definition used by this tree.
	 */
	.section .altinstructions, "a"
	.align 8
	.quad memcpy				/* code to be patched */
	.quad 1b				/* replacement code */
	.byte X86_FEATURE_REP_GOOD		/* apply when this feature is set */

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 */
	.byte 2b - 1b				/* presumably: length to patch */
	.byte 2b - 1b				/* presumably: replacement length */
	.previous