/*
 * Copyright (C) 2006 Apple Computer, Inc. All rights reserved.
 *
 * This document is the property of Apple Computer, Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Computer, Inc.
 */

    .text
    .align 2

    .globl _memcpy
    .globl _bcopy
    .globl _memmove

_bcopy:     /* void bcopy(const void *src, void *dest, size_t len); */
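    /* bcopy passes (src, dest) while memcpy takes (dest, src): swap r0 and r1, then fall into _memcpy */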
    mov     r3, r0
    mov     r0, r1
    mov     r1, r3

_memcpy:    /* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:   /* void *memmove(void *dest, const void *src, size_t len); */
    /* check for zero len or if the pointers are the same */
    cmp     r2, #0
    cmpne   r0, r1
    bxeq    lr
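    /* past this point len != 0 and dest != src; r0 (dest) is what memcpy/memmove must eventually return */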

    /* save r0 (return value), r4 (scratch), and r5 (scratch) */
    stmfd   sp!, { r0, r4, r5 }

    /* check for overlap: r3 <- distance between src & dest */
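    /* the hs/lo conditions below reuse the flags from the cmpne above to compute |dest - src| */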
    subhs   r3, r0, r1
    sublo   r3, r1, r0
    cmp     r3, r2      /* if distance(src, dest) < len, we have overlap */
    blo     Loverlap

Lnormalforwardcopy:
    /* are src and dest dissimilarly word aligned? */
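    /* lsl #30 keeps only the low two address bits, so this compares sub-word alignment */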
    mov     r12, r0, lsl #30
    cmp     r12, r1, lsl #30
    bne     Lnonwordaligned_forward

    /* if len < 64, do a quick forward copy */
    cmp     r2, #64
    blt     Lsmallforwardcopy

    /* check for 16 byte src/dest unalignment */
    tst     r0, #0xf
    bne     Lsimilarlyunaligned

    /* check for 32 byte dest unalignment */
    tst     r0, #(1<<4)
    bne     Lunaligned_32

Lmorethan64_aligned:
    /* save some more registers to use in the copy */
    stmfd   sp!, { r6, r7, r10, r11 }

    /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
    sub     r2, r2, #64

L64loop:
    /* copy 64 bytes at a time */
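    /* two 32-byte ldm/stm bursts per pass; on ARMv7 the pld hints prefetch the next src lines */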
    ldmia   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
#if ARCH_ARMv7
    pld     [r1, #32]
#endif
    stmia   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    ldmia   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    subs    r2, r2, #64
#if ARCH_ARMv7
    pld     [r1, #32]
#endif
    stmia   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    bge     L64loop

    /* restore the scratch registers we just saved */
    ldmfd   sp!, { r6, r7, r10, r11 }

    /* fix up the len counter (we pre-subtracted an extra 64 above) and test for completion */
    adds    r2, r2, #64
    beq     Lexit

Llessthan64_aligned:
    /* copy 16 bytes at a time until we have < 16 bytes */
    cmp     r2, #16
    ldmiage r1!, { r3, r4, r5, r12 }
    stmiage r0!, { r3, r4, r5, r12 }
    subsge  r2, r2, #16
    bgt     Llessthan64_aligned
    beq     Lexit

Llessthan16_aligned:
    mov     r2, r2, lsl #28
    msr     cpsr_f, r2
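    /*
     * The low 4 bits of len now sit in the N, Z, C, and V flags:
     * N (mi) = bit 3 -> 8 bytes, Z (eq) = bit 2 -> 4 bytes,
     * C (cs) = bit 1 -> 2 bytes, V (vs) = bit 0 -> 1 byte.
     */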

    ldmiami r1!, { r2, r3 }
    ldreq   r4, [r1], #4
    ldrhcs  r5, [r1], #2
    ldrbvs  r12, [r1], #1

    stmiami r0!, { r2, r3 }
    streq   r4, [r0], #4
    strhcs  r5, [r0], #2
    strbvs  r12, [r0], #1
    b       Lexit

Lsimilarlyunaligned:
    /* src and dest are unaligned in similar ways; first copy up to dest 16 byte alignment */
    mov     r12, r0, lsl #28
    rsb     r12, r12, #0
    msr     cpsr_f, r12
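    /* r12 = -(dest & 0xf) << 28: the flags now select the 1/2/4/8 byte moves needed to align dest */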

    ldrbvs  r3, [r1], #1
    ldrhcs  r4, [r1], #2
    ldreq   r5, [r1], #4

    strbvs  r3, [r0], #1
    strhcs  r4, [r0], #2
    streq   r5, [r0], #4

    ldmiami r1!, { r3, r4 }
    stmiami r0!, { r3, r4 }

    subs    r2, r2, r12, lsr #28
    beq     Lexit

Lunaligned_32:
    /* bring dest up to 32 byte alignment */
    tst     r0, #(1 << 4)
    ldmiane r1!, { r3, r4, r5, r12 }
    stmiane r0!, { r3, r4, r5, r12 }
    subne   r2, r2, #16

    /* we should now be aligned; see which copy method to use */
    cmp     r2, #64
    bge     Lmorethan64_aligned
    b       Llessthan64_aligned

Lbytewise2:
    /* copy 2 bytes at a time */
    subs    r2, r2, #2
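    /* if only 1 byte remains, the subtraction goes negative (mi) and "pl" skips the second byte */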

    ldrb    r3, [r1], #1
    ldrbpl  r4, [r1], #1

    strb    r3, [r0], #1
    strbpl  r4, [r0], #1

    bhi     Lbytewise2
    b       Lexit

Lbytewise:
    /* simple bytewise forward copy */
    ldrb    r3, [r1], #1
    subs    r2, r2, #1
    strb    r3, [r0], #1
    bne     Lbytewise
    b       Lexit

Lsmallforwardcopy:
    /* src and dest are similarly word aligned, with less than 64 bytes to copy */
    cmp     r2, #4
    blt     Lbytewise2

    /* bytewise copy until word aligned */
    tst     r1, #3
Lwordalignloop:
    ldrbne  r3, [r1], #1
    strbne  r3, [r0], #1
    subne   r2, r2, #1
    tstne   r1, #3
    bne     Lwordalignloop

    cmp     r2, #16
    bge     Llessthan64_aligned
    blt     Llessthan16_aligned

Loverlap:
    /* src and dest overlap in some way, len > 0 */
    cmp     r0, r1      /* if dest > src */
    bhi     Loverlap_srclower

Loverlap_destlower:
    /* dest < src: see if we can still do a fast forward copy, or fall back to the slow forward copy */
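    /* r3 still holds distance(src, dest); a forward copy is safe while each copy stride fits within it */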
    cmp     r3, #64
    bge     Lnormalforwardcopy  /* overlap is greater than one stride of the copy, use the normal copy */

    cmp     r3, #2
    bge     Lbytewise2
    b       Lbytewise

    /* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
    /* src < dest, with overlap */

    /* src += len; dest += len; */
    add     r0, r0, r2
    add     r1, r1, r2
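    /* both pointers now point one past their last byte; everything below walks downwards */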

    /* we have to copy in reverse no matter what; test whether we can use a large block reverse copy */
    cmp     r2, #64     /* less than 64 bytes to copy? */
    cmpgt   r3, #64     /* less than 64 bytes of nonoverlap? */
    blt     Lbytewise_reverse

    /* test if src and dest are word aligned differently */
    mov     r3, r0, lsl #30
    cmp     r3, r1, lsl #30
    bne     Lbytewise_reverse

    /* test whether dest is 16 byte aligned (src and dest already share word alignment) */
    tst     r0, #0xf
    bne     Lunaligned_reverse_similarly

    /* test for dest 32 byte alignment */
    tst     r0, #(1<<4)
    bne     Lunaligned_32_reverse_similarly

    /* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
    /* save some more registers to use in the copy */
    stmfd   sp!, { r6, r7, r10, r11 }

    /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
    sub     r2, r2, #64

L64loop_reverse:
    /* copy 64 bytes at a time */
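    /* mirror of L64loop: ldmdb/stmdb decrement before each access and the prefetch offsets are negative */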
    ldmdb   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
#if ARCH_ARMv7
    pld     [r1, #-32]
#endif
    stmdb   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    ldmdb   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    subs    r2, r2, #64
#if ARCH_ARMv7
    pld     [r1, #-32]
#endif
    stmdb   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    bge     L64loop_reverse

    /* restore the scratch registers we just saved */
    ldmfd   sp!, { r6, r7, r10, r11 }

    /* fix up the len counter (we pre-subtracted an extra 64 above) and test for completion */
    adds    r2, r2, #64
    beq     Lexit

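    /* simple bytewise reverse copy, pre-decrementing both pointers */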
Lbytewise_reverse:
    ldrb    r3, [r1, #-1]!
    strb    r3, [r0, #-1]!
    subs    r2, r2, #1
    bne     Lbytewise_reverse
    b       Lexit

Lunaligned_reverse_similarly:
    /* src and dest are unaligned in similar ways; first copy down to dest 16 byte alignment */
    mov     r12, r0, lsl #28
    msr     cpsr_f, r12
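    /* no negation needed here: copying downwards, dest's low nibble is itself the byte count to strip */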

    ldrbvs  r3, [r1, #-1]!
    ldrhcs  r4, [r1, #-2]!
    ldreq   r5, [r1, #-4]!

    strbvs  r3, [r0, #-1]!
    strhcs  r4, [r0, #-2]!
    streq   r5, [r0, #-4]!

    ldmdbmi r1!, { r3, r4 }
    stmdbmi r0!, { r3, r4 }

    subs    r2, r2, r12, lsr #28
    beq     Lexit

Lunaligned_32_reverse_similarly:
    /* bring dest down to 32 byte alignment */
    tst     r0, #(1 << 4)
    ldmdbne r1!, { r3, r4, r5, r12 }
    stmdbne r0!, { r3, r4, r5, r12 }
    subne   r2, r2, #16

    /* we should now be aligned; see which copy method to use */
    cmp     r2, #64
    bge     Lmorethan64_aligned_reverse
    b       Lbytewise_reverse

    /* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
    cmp     r2, #8
    blt     Lbytewise2      /* not worth the shifting setup below this threshold */

    /* bytewise copy until src word aligned */
    tst     r1, #3
Lwordalignloop2:
    ldrbne  r3, [r1], #1
    strbne  r3, [r0], #1
    subne   r2, r2, #1
    tstne   r1, #3
    bne     Lwordalignloop2

    /* figure out how the src and dest are unaligned */
    and     r3, r0, #3
    cmp     r3, #2
    blt     Lalign1_forward
    beq     Lalign2_forward
    bgt     Lalign3_forward
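    /*
     * From here on, src is word aligned and dest is off by 1, 2, or 3 bytes.
     * Each path reads whole words from src and knits them into dest words
     * with shifts, carrying the leftover bytes between iterations in r4.
     */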

Lalign1_forward:
    /* the dest pointer is 1 byte off from src */
    mov     r12, r2, lsr #2     /* number of words we should copy */
    sub     r0, r0, #1

    /* prime the copy */
    ldrb    r4, [r0]            /* load D[7:0] */

Lalign1_forward_loop:
    ldr     r3, [r1], #4        /* load S */
    orr     r4, r4, r3, lsl #8  /* D[31:8] = S[23:0] */
    str     r4, [r0], #4        /* save D */
    mov     r4, r3, lsr #24     /* D[7:0] = S[31:24] */
    subs    r12, r12, #1
    bne     Lalign1_forward_loop

    /* finish the copy off */
    strb    r4, [r0], #1        /* save D[7:0] */

    ands    r2, r2, #3
    beq     Lexit
    b       Lbytewise2

Lalign2_forward:
    /* the dest pointer is 2 bytes off from src */
    mov     r12, r2, lsr #2     /* number of words we should copy */
    sub     r0, r0, #2

    /* prime the copy */
    ldrh    r4, [r0]            /* load D[15:0] */

Lalign2_forward_loop:
    ldr     r3, [r1], #4        /* load S */
    orr     r4, r4, r3, lsl #16 /* D[31:16] = S[15:0] */
    str     r4, [r0], #4        /* save D */
    mov     r4, r3, lsr #16     /* D[15:0] = S[31:16] */
    subs    r12, r12, #1
    bne     Lalign2_forward_loop

    /* finish the copy off */
    strh    r4, [r0], #2        /* save D[15:0] */

    ands    r2, r2, #3
    beq     Lexit
    b       Lbytewise2

Lalign3_forward:
    /* the dest pointer is 3 bytes off from src */
    mov     r12, r2, lsr #2     /* number of words we should copy */
    sub     r0, r0, #3

    /* prime the copy */
    ldr     r4, [r0]
    and     r4, r4, #0x00ffffff /* load D[23:0] */

Lalign3_forward_loop:
    ldr     r3, [r1], #4        /* load S */
    orr     r4, r4, r3, lsl #24 /* D[31:24] = S[7:0] */
    str     r4, [r0], #4        /* save D */
    mov     r4, r3, lsr #8      /* D[23:0] = S[31:8] */
    subs    r12, r12, #1
    bne     Lalign3_forward_loop

    /* finish the copy off */
    strh    r4, [r0], #2        /* save D[15:0] */
    mov     r4, r4, lsr #16
    strb    r4, [r0], #1        /* save D[23:16] */

    ands    r2, r2, #3
    beq     Lexit
    b       Lbytewise2

Lexit:
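    /* restoring r0 brings back the original dest pointer: the return value for memcpy/memmove */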
    ldmfd   sp!, { r0, r4, r5 }
    bx      lr