// iBoot/lib/libc/arm64/bcopy.S

/*
* Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*
* This file implements the following functions for the arm64 architecture.
*
* void bcopy(const void * source,
*            void * destination,
*            size_t length);
*
* void *memmove(void * destination,
*               const void * source,
*               size_t n);
*
* void *memcpy(void * restrict destination,
*              const void * restrict source,
*              size_t n);
*
* All copy n successive bytes from source to destination. Memmove and memcpy
* return destination, whereas bcopy has no return value. Copying takes place
* as if it were through a temporary buffer -- after return, destination
* contains exactly the bytes from source, even if the buffers overlap (this is
* not required of memcpy by the C standard; its behavior is undefined if the
* buffers overlap, but we are holding ourselves to the historical behavior of
* this function on MacOS).
*/
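// For reference only -- a minimal C model (hypothetical, not part of this
// build) of the overlap-safe semantics described above, using the same
// unsigned pointer-difference test that the assembly below performs:
//
//     #include <stddef.h>
//     #include <stdint.h>
//
//     void *memmove_model(void *dst, const void *src, size_t n) {
//         unsigned char *d = dst;
//         const unsigned char *s = src;
//         if ((uintptr_t)d - (uintptr_t)s >= n) {
//             while (n--) *d++ = *s++;   // forward copy is safe
//         } else {
//             while (n--) d[n] = s[n];   // overlapping: copy backwards
//         }
//         return dst;
//     }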
#include "asm.h"
.globl _bcopy
.globl _memcpy
.globl _memmove
/*****************************************************************************
* Macros *
*****************************************************************************/
#define kSmallCopy 64
/*****************************************************************************
* Entrypoints *
*****************************************************************************/
.text
.align 5
_bcopy:
// Translate bcopy into memcpy by swapping the first and second arguments.
mov x3, x0
mov x0, x1
mov x1, x3
.align 4
_memcpy:
_memmove:
// Our preference is to copy the data in ascending address order, but if the
// buffers overlap such that the beginning of the destination buffer aliases
// the end of the source buffer, we need to copy in descending address order
// instead to preserve the memmove semantics. We detect this case with the
// test:
//
// destination - source < length (unsigned compare)
//
// If the address of the source buffer is higher than the address of the
// destination buffer, this arithmetic can overflow, but the overflowed value
// can only be smaller than length if the buffers do not overlap, so we don't
// need to worry about false positives due to the overflow (they happen, but
// only in cases where copying in either order is correct).
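//
// A worked example (illustrative values): dst = 0x1010, src = 0x1000,
// length = 0x20. Then dst - src = 0x10 < 0x20, so the buffers overlap with
// dst inside [src, src+length), and we must copy backwards. Conversely, with
// dst = 0x1000, src = 0x1010, length = 0x20, dst - src wraps to
// 0xfffffffffffffff0, which is >= 0x20, so we copy forward; that is safe
// because each source byte is read before the forward copy can overwrite it.
// Note that dst == src gives a difference of 0 < length, sending us down the
// reverse path, which begins with an early-out for exactly this case.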
PUSH_FRAME
sub x3, x0, x1
cmp x3, x2
b.cc L_reverse
mov x3, x0 // copy destination pointer
cmp x2, #(kSmallCopy)
b.cc L_forwardSmallCopy
/*****************************************************************************
* Forward large copy *
*****************************************************************************/
// Load the first 32 bytes from src, and compute the number of bytes to the
// first 32-byte aligned location in dst. Even though we are going to copy
// 32 bytes, only those preceding that 32-byte location "count" towards
// reducing the length of the buffer or advancing the pointers. We will need
// to issue the first load from the advanced src pointer BEFORE the store to
// the unmodified dst pointer.
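//
// For example (illustrative values): if dst = 0x1003, then
// (0x1003 + 32) & -32 = 0x1020 is the aligned location, and x5 = 0x1d (29)
// bytes "count". The initial 32-byte store to [0x1003, 0x1023) covers those
// 29 bytes; the 3 bytes at [0x1020, 0x1023) are simply stored again, with the
// same values, by the first aligned store. If dst is already 32-byte
// aligned, x5 = 32 and all of the initial 32 bytes count.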
add x3, x3, #32
and x3, x3, #-32 // aligned dst
ldp x12,x13,[x1]
ldp x14,x15,[x1, #16]
sub x5, x3, x0 // bytes between original dst and aligned dst
add x1, x1, x5 // update src pointer
// At this point, data in the following registers is in flight:
//
// x0 original dst pointer
// x1 corresponding location in src buffer.
// x2 length from aligned location in dst to end of buffer. This is
// guaranteed to be >= (64 - 32).
// x3 aligned location in dst buffer.
// x12:x15 first 32 bytes of src buffer.
//
// We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x3. The
// store *may* overlap the first 32 bytes of the load, so in order to get
// correct memmove semantics, the first 32 byte load must occur before the
// store.
//
// After loading these 32 bytes, we advance x1, and decrement the length by
// 64. If the remaining length of the buffer was less than 64, then we jump
// directly to the cleanup path.
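//
// Example of the hazard (illustrative values): dst = 0x101f, src = 0x1020,
// length large. The forward path is chosen (dst - src wraps, so it is >=
// length), the aligned location is 0x1020, and x5 = 1, so the updated x1 is
// 0x1021. The 32-byte load at [0x1021, 0x1041) overlaps the 32-byte store to
// [0x101f, 0x103f); issuing the load first preserves the source bytes.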
ldp x8, x9, [x1]
ldp x10,x11,[x1, #16]
add x1, x1, #32
sub x2, x2, x5 // update length
stp x12,x13,[x0] // initial unaligned store
stp x14,x15,[x0, #16] // initial unaligned store
subs x2, x2, #64
b.ls L_forwardCleanup
L_forwardCopyLoop:
// Main copy loop:
//
// 1. store the 32 bytes loaded in the previous loop iteration
// 2. advance the destination pointer
// 3. load the next 32 bytes
// 4. advance the source pointer
// 5. subtract 32 from the length
//
// The loop is terminated when 32 or fewer bytes remain to be loaded. Those
// trailing 1-32 bytes will be copied in the loop cleanup.
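//
// Note that the loads and stores in this loop are the non-temporal pair
// variants (ldnp/stnp), which hint to the memory system that the data need
// not be kept in cache; for bulk copies this avoids displacing more useful
// lines. The b.hi falls through once the subtraction leaves x2 at or below
// zero, i.e. with x2 in [-31, 0] and x2 + 32 bytes still to be loaded, in
// addition to the 32 in-flight bytes in x8-x11.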
stnp x8, x9, [x3]
stnp x10,x11,[x3, #16]
add x3, x3, #32
ldnp x8, x9, [x1]
ldnp x10,x11,[x1, #16]
add x1, x1, #32
subs x2, x2, #32
b.hi L_forwardCopyLoop
L_forwardCleanup:
// There are 32 bytes in x8-x11 that were loaded in the previous loop
// iteration, which need to be stored to [x3,x3+32). In addition, between
// 0 and 32 more bytes need to be copied from x1 to x3 + 32. The exact
// number of bytes to copy is x2 + 32. Instead of using smaller conditional
// copies, we simply copy the 32 unaligned bytes at [x1+x2, x1+x2+32) to
// [32+x3+x2, 64+x3+x2).
// This copy may overlap with the first store, so the loads must come before
// the store of the data from the previous loop iteration.
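//
// For example (illustrative values): if the loop exits with x2 = -5, then 27
// tail bytes remain. The loads below fetch [x1-5, x1+27), whose last 27
// bytes are exactly the unread tail of the source, and the final stores land
// on [32+x3+x2, 64+x3+x2) = [x3+27, x3+59), ending precisely at the end of
// the destination buffer. The 5 re-stored bytes receive the same values from
// both stores.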
add x1, x1, x2
ldp x12,x13,[x1]
ldp x14,x15,[x1, #16]
stp x8, x9, [x3]
stp x10,x11,[x3, #16]
add x3, x3, x2
stp x12,x13,[x3, #32]
stp x14,x15,[x3, #48]
POP_FRAME
ret
/*****************************************************************************
* forward small copy *
*****************************************************************************/
// Copy one quadword at a time until less than 8 bytes remain to be copied.
// At the point of entry to L_forwardSmallCopy, the "calling convention"
// is as follows:
//
// x0 pointer to first byte of destination
// x1 pointer to first byte of source
// x2 length of buffers
// x3 pointer to first byte of destination
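//
// For example, a 13-byte copy enters at L_forwardSmallCopy: 13 - 8 = 5 with
// no borrow, so one 8-byte quadword is copied via the loop at 0b; the second
// subtraction borrows, 8 is added back, and the byte loop at 1b copies the
// remaining 5 bytes. Lengths of 0-7 borrow immediately and skip the
// quadword loop entirely.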
0: ldr x6, [x1],#8
str x6, [x3],#8
L_forwardSmallCopy:
subs x2, x2, #8
b.cs 0b
adds x2, x2, #8
b.eq 2f
1: ldrb w6, [x1],#1
strb w6, [x3],#1
subs x2, x2, #1
b.ne 1b
2: POP_FRAME
ret
/*****************************************************************************
* Reverse copy engines *
*****************************************************************************/
// The reverse copy engines are identical in every way to the forward copy
// engines, except in that they do everything backwards. For this reason, they
// are somewhat more sparsely commented than the forward copy loops. I have
// tried to only comment things that might be somewhat surprising in how they
// differ from the forward implementation.
//
// The one important thing to note is that (almost without fail), x1 and x3
// will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
// throughout these copy loops. They are initially advanced to that position
// in the L_reverse jump island. Because of this, whereas the forward copy
// loops generally follow a "copy data, then advance pointers" scheme, in the
// reverse copy loops, we advance the pointers, then copy the data.
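//
// Concretely: after the L_reverse setup, x1 = src + length and x4 = dst +
// length, so every access either pre-decrements (e.g. ldr x6, [x1,#-8]!) or
// uses a negative offset, and the "current" byte is always strictly below
// the pointer.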
L_reverse:
// As a minor optimization, we early out if dst == src.
cbz x3, L_return
// advance both pointers to the ends of their respective buffers before
// jumping into the appropriate reverse copy loop.
add x4, x0, x2
add x1, x1, x2
cmp x2, #(kSmallCopy)
b.cc L_reverseSmallCopy
/*****************************************************************************
* Reverse large copy *
*****************************************************************************/
ldp x12,x13,[x1, #-16]
ldp x14,x15,[x1, #-32]
sub x3, x4, #1 // In the forward copy, we used dst+32 & -32
and x3, x3, #-32 // to find an aligned location in the dest
sub x5, x4, x3 // buffer. Here we use dst-1 & -32 instead,
sub x1, x1, x5 // because we are going backwards.
sub x2, x2, x5
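// For example (illustrative values): if the end of the destination buffer is
// x4 = 0x2043, then (0x2043 - 1) & -32 = 0x2040 and x5 = 3; the initial
// 32-byte store to [0x2023, 0x2043) covers those 3 unaligned tail bytes, and
// the loop stores descend from the aligned address 0x2040. If x4 is already
// 32-byte aligned, x5 = 32.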
ldp x8, x9, [x1, #-16]
ldp x10,x11,[x1, #-32]
stp x12,x13,[x4, #-16]
stp x14,x15,[x4, #-32]
sub x1, x1, #32
subs x2, x2, #64
b.ls L_reverseCleanup
L_reverseCopyLoop:
stnp x8, x9, [x3, #-16]
stnp x10,x11,[x3, #-32]
sub x3, x3, #32
ldnp x8, x9, [x1, #-16]
ldnp x10,x11,[x1, #-32]
sub x1, x1, #32
subs x2, x2, #32
b.hi L_reverseCopyLoop
L_reverseCleanup:
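// On entry x2 is in [-32, 0], i.e. x2 + 32 bytes remain unread at the bottom
// of the source buffer. After the adjustment below, the two loads fetch
// exactly the first 32 bytes of the source, mirroring the forward cleanup.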
sub x1, x1, x2
ldp x12,x13,[x1, #-16]
ldp x14,x15,[x1, #-32]
stp x8, x9, [x3, #-16]
stp x10,x11,[x3, #-32]
stp x12,x13,[x0, #16] // In the forward copy, we need to compute the
stp x14,x15,[x0] // address of these stores, but here we already
POP_FRAME // have a pointer to the start of the buffer.
ret
/*****************************************************************************
* reverse small copy *
*****************************************************************************/
0: ldr x6, [x1,#-8]!
str x6, [x4,#-8]!
L_reverseSmallCopy:
subs x2, x2, #8
b.cs 0b
adds x2, x2, #8
b.eq 2f
1: ldrb w6, [x1,#-1]!
strb w6, [x4,#-1]!
subs x2, x2, #1
b.ne 1b
2: POP_FRAME
ret
L_return:
POP_FRAME
ret