/*
 * Copyright (C) 2006 Apple Computer, Inc. All rights reserved.
 *
 * This document is the property of Apple Computer, Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Computer, Inc.
 */

    .text
    .align 2

    .globl _memcpy
    .globl _bcopy
    .globl _memmove

_bcopy:     /* void bcopy(const void *src, void *dest, size_t len); */
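    /* bcopy passes (src, dest) while memcpy takes (dest, src): swap r0 and r1, then fall into _memcpy */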
    mov     r3, r0
    mov     r0, r1
    mov     r1, r3

_memcpy:    /* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:   /* void *memmove(void *dest, const void *src, size_t len); */
    /* check for zero len or if the pointers are the same */
    cmp     r2, #0
    cmpne   r0, r1
    bxeq    lr
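    /* past this point len != 0 and dest != src; r0 (dest) is what memcpy/memmove must eventually return */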

    /* save r0 (return value), r4 (scratch), and r5 (scratch) */
    stmfd   sp!, { r0, r4, r5 }

    /* check for overlap: r3 <- distance between src & dest */
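    /* the hs/lo conditions below reuse the flags from the cmpne above to compute |dest - src| */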
    subhs   r3, r0, r1
    sublo   r3, r1, r0
    cmp     r3, r2      /* if distance(src, dest) < len, we have overlap */
    blo     Loverlap

Lnormalforwardcopy:
    /* are src and dest dissimilarly word aligned? */
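    /* lsl #30 keeps only the low two address bits, so this compares sub-word alignment */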
    mov     r12, r0, lsl #30
    cmp     r12, r1, lsl #30
    bne     Lnonwordaligned_forward

    /* if len < 64, do a quick forward copy */
    cmp     r2, #64
    blt     Lsmallforwardcopy

    /* check for 16 byte src/dest unalignment */
    tst     r0, #0xf
    bne     Lsimilarlyunaligned

    /* check for 32 byte dest unalignment */
    tst     r0, #(1<<4)
    bne     Lunaligned_32

Lmorethan64_aligned:
    /* save some more registers to use in the copy */
    stmfd   sp!, { r6, r7, r10, r11 }

    /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
    sub     r2, r2, #64

L64loop:
    /* copy 64 bytes at a time */
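    /* two 32-byte ldm/stm bursts per pass; on ARMv7 the pld hints prefetch the next src lines */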
    ldmia   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
#if ARCH_ARMv7
    pld     [r1, #32]
#endif
    stmia   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    ldmia   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    subs    r2, r2, #64
#if ARCH_ARMv7
    pld     [r1, #32]
#endif
    stmia   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    bge     L64loop

    /* restore the scratch registers we just saved */
    ldmfd   sp!, { r6, r7, r10, r11 }

    /* fix up the len counter (we pre-subtracted an extra 64 above) and test for completion */
    adds    r2, r2, #64
    beq     Lexit

Llessthan64_aligned:
    /* copy 16 bytes at a time until we have < 16 bytes */
    cmp     r2, #16
    ldmiage r1!, { r3, r4, r5, r12 }
    stmiage r0!, { r3, r4, r5, r12 }
    subsge  r2, r2, #16
    bgt     Llessthan64_aligned
    beq     Lexit

Llessthan16_aligned:
    mov     r2, r2, lsl #28
    msr     cpsr_f, r2
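    /*
     * The low 4 bits of len now sit in the N, Z, C, and V flags:
     * N (mi) = bit 3 -> 8 bytes, Z (eq) = bit 2 -> 4 bytes,
     * C (cs) = bit 1 -> 2 bytes, V (vs) = bit 0 -> 1 byte.
     */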

    ldmiami r1!, { r2, r3 }
    ldreq   r4, [r1], #4
    ldrhcs  r5, [r1], #2
    ldrbvs  r12, [r1], #1

    stmiami r0!, { r2, r3 }
    streq   r4, [r0], #4
    strhcs  r5, [r0], #2
    strbvs  r12, [r0], #1
    b       Lexit

Lsimilarlyunaligned:
    /* src and dest are unaligned in similar ways; first copy up to dest 16 byte alignment */
    mov     r12, r0, lsl #28
    rsb     r12, r12, #0
    msr     cpsr_f, r12
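    /* r12 = -(dest & 0xf) << 28: the flags now select the 1/2/4/8 byte moves needed to align dest */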

    ldrbvs  r3, [r1], #1
    ldrhcs  r4, [r1], #2
    ldreq   r5, [r1], #4

    strbvs  r3, [r0], #1
    strhcs  r4, [r0], #2
    streq   r5, [r0], #4

    ldmiami r1!, { r3, r4 }
    stmiami r0!, { r3, r4 }

    subs    r2, r2, r12, lsr #28
    beq     Lexit

Lunaligned_32:
    /* bring dest up to 32 byte alignment */
    tst     r0, #(1 << 4)
    ldmiane r1!, { r3, r4, r5, r12 }
    stmiane r0!, { r3, r4, r5, r12 }
    subne   r2, r2, #16

    /* we should now be aligned; see which copy method to use */
    cmp     r2, #64
    bge     Lmorethan64_aligned
    b       Llessthan64_aligned

Lbytewise2:
    /* copy 2 bytes at a time */
    subs    r2, r2, #2
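    /* if only 1 byte remains, the subtraction goes negative (mi) and "pl" skips the second byte */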

    ldrb    r3, [r1], #1
    ldrbpl  r4, [r1], #1

    strb    r3, [r0], #1
    strbpl  r4, [r0], #1

    bhi     Lbytewise2
    b       Lexit

Lbytewise:
    /* simple bytewise forward copy */
    ldrb    r3, [r1], #1
    subs    r2, r2, #1
    strb    r3, [r0], #1
    bne     Lbytewise
    b       Lexit

Lsmallforwardcopy:
    /* src and dest are similarly word aligned, with less than 64 bytes to copy */
    cmp     r2, #4
    blt     Lbytewise2

    /* bytewise copy until word aligned */
    tst     r1, #3
Lwordalignloop:
    ldrbne  r3, [r1], #1
    strbne  r3, [r0], #1
    subne   r2, r2, #1
    tstne   r1, #3
    bne     Lwordalignloop

    cmp     r2, #16
    bge     Llessthan64_aligned
    blt     Llessthan16_aligned

Loverlap:
    /* src and dest overlap in some way, len > 0 */
    cmp     r0, r1      /* if dest > src */
    bhi     Loverlap_srclower

Loverlap_destlower:
    /* dest < src: see if we can still do a fast forward copy, or fall back to the slow forward copy */
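    /* r3 still holds distance(src, dest); a forward copy is safe while each copy stride fits within it */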
    cmp     r3, #64
    bge     Lnormalforwardcopy  /* overlap is greater than one stride of the copy, use the normal copy */

    cmp     r3, #2
    bge     Lbytewise2
    b       Lbytewise

    /* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
    /* src < dest, with overlap */

    /* src += len; dest += len; */
    add     r0, r0, r2
    add     r1, r1, r2
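    /* both pointers now point one past their last byte; everything below walks downwards */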

    /* we have to copy in reverse no matter what; test whether we can use a large block reverse copy */
    cmp     r2, #64     /* less than 64 bytes to copy? */
    cmpgt   r3, #64     /* less than 64 bytes of nonoverlap? */
    blt     Lbytewise_reverse

    /* test if src and dest are word aligned differently */
    mov     r3, r0, lsl #30
    cmp     r3, r1, lsl #30
    bne     Lbytewise_reverse

    /* test whether dest is 16 byte aligned (src and dest already share word alignment) */
    tst     r0, #0xf
    bne     Lunaligned_reverse_similarly

    /* test for dest 32 byte alignment */
    tst     r0, #(1<<4)
    bne     Lunaligned_32_reverse_similarly

    /* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
    /* save some more registers to use in the copy */
    stmfd   sp!, { r6, r7, r10, r11 }

    /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
    sub     r2, r2, #64

L64loop_reverse:
    /* copy 64 bytes at a time */
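    /* mirror of L64loop: ldmdb/stmdb decrement before each access and the prefetch offsets are negative */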
    ldmdb   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
#if ARCH_ARMv7
    pld     [r1, #-32]
#endif
    stmdb   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    ldmdb   r1!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    subs    r2, r2, #64
#if ARCH_ARMv7
    pld     [r1, #-32]
#endif
    stmdb   r0!, { r3, r4, r5, r6, r7, r10, r11, r12 }
    bge     L64loop_reverse

    /* restore the scratch registers we just saved */
    ldmfd   sp!, { r6, r7, r10, r11 }

    /* fix up the len counter (we pre-subtracted an extra 64 above) and test for completion */
    adds    r2, r2, #64
    beq     Lexit

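    /* simple bytewise reverse copy, pre-decrementing both pointers */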
Lbytewise_reverse:
    ldrb    r3, [r1, #-1]!
    strb    r3, [r0, #-1]!
    subs    r2, r2, #1
    bne     Lbytewise_reverse
    b       Lexit

Lunaligned_reverse_similarly:
    /* src and dest are unaligned in similar ways; first copy down to dest 16 byte alignment */
    mov     r12, r0, lsl #28
    msr     cpsr_f, r12
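    /* no negation needed here: copying downwards, dest's low nibble is itself the byte count to strip */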

    ldrbvs  r3, [r1, #-1]!
    ldrhcs  r4, [r1, #-2]!
    ldreq   r5, [r1, #-4]!

    strbvs  r3, [r0, #-1]!
    strhcs  r4, [r0, #-2]!
    streq   r5, [r0, #-4]!

    ldmdbmi r1!, { r3, r4 }
    stmdbmi r0!, { r3, r4 }

    subs    r2, r2, r12, lsr #28
    beq     Lexit

Lunaligned_32_reverse_similarly:
    /* bring dest down to 32 byte alignment */
    tst     r0, #(1 << 4)
    ldmdbne r1!, { r3, r4, r5, r12 }
    stmdbne r0!, { r3, r4, r5, r12 }
    subne   r2, r2, #16

    /* we should now be aligned; see which copy method to use */
    cmp     r2, #64
    bge     Lmorethan64_aligned_reverse
    b       Lbytewise_reverse

    /* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
    cmp     r2, #8
    blt     Lbytewise2      /* not worth the shifting setup below this threshold */

    /* bytewise copy until src word aligned */
    tst     r1, #3
Lwordalignloop2:
    ldrbne  r3, [r1], #1
    strbne  r3, [r0], #1
    subne   r2, r2, #1
    tstne   r1, #3
    bne     Lwordalignloop2

    /* figure out how the src and dest are unaligned */
    and     r3, r0, #3
    cmp     r3, #2
    blt     Lalign1_forward
    beq     Lalign2_forward
    bgt     Lalign3_forward
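    /*
     * From here on, src is word aligned and dest is off by 1, 2, or 3 bytes.
     * Each path reads whole words from src and knits them into dest words
     * with shifts, carrying the leftover bytes between iterations in r4.
     */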

Lalign1_forward:
    /* the dest pointer is 1 byte off from src */
    mov     r12, r2, lsr #2     /* number of words we should copy */
    sub     r0, r0, #1

    /* prime the copy */
    ldrb    r4, [r0]            /* load D[7:0] */

Lalign1_forward_loop:
    ldr     r3, [r1], #4        /* load S */
    orr     r4, r4, r3, lsl #8  /* D[31:8] = S[23:0] */
    str     r4, [r0], #4        /* save D */
    mov     r4, r3, lsr #24     /* D[7:0] = S[31:24] */
    subs    r12, r12, #1
    bne     Lalign1_forward_loop

    /* finish the copy off */
    strb    r4, [r0], #1        /* save D[7:0] */

    ands    r2, r2, #3
    beq     Lexit
    b       Lbytewise2

Lalign2_forward:
    /* the dest pointer is 2 bytes off from src */
    mov     r12, r2, lsr #2     /* number of words we should copy */
    sub     r0, r0, #2

    /* prime the copy */
    ldrh    r4, [r0]            /* load D[15:0] */

Lalign2_forward_loop:
    ldr     r3, [r1], #4        /* load S */
    orr     r4, r4, r3, lsl #16 /* D[31:16] = S[15:0] */
    str     r4, [r0], #4        /* save D */
    mov     r4, r3, lsr #16     /* D[15:0] = S[31:16] */
    subs    r12, r12, #1
    bne     Lalign2_forward_loop

    /* finish the copy off */
    strh    r4, [r0], #2        /* save D[15:0] */

    ands    r2, r2, #3
    beq     Lexit
    b       Lbytewise2

Lalign3_forward:
    /* the dest pointer is 3 bytes off from src */
    mov     r12, r2, lsr #2     /* number of words we should copy */
    sub     r0, r0, #3

    /* prime the copy */
    ldr     r4, [r0]
    and     r4, r4, #0x00ffffff /* load D[23:0] */

Lalign3_forward_loop:
    ldr     r3, [r1], #4        /* load S */
    orr     r4, r4, r3, lsl #24 /* D[31:24] = S[7:0] */
    str     r4, [r0], #4        /* save D */
    mov     r4, r3, lsr #8      /* D[23:0] = S[31:8] */
    subs    r12, r12, #1
    bne     Lalign3_forward_loop

    /* finish the copy off */
    strh    r4, [r0], #2        /* save D[15:0] */
    mov     r4, r4, lsr #16
    strb    r4, [r0], #1        /* save D[23:16] */

    ands    r2, r2, #3
    beq     Lexit
    b       Lbytewise2

Lexit:
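    /* restoring r0 brings back the original dest pointer: the return value for memcpy/memmove */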
    ldmfd   sp!, { r0, r4, r5 }
    bx      lr