iBoot/lib/libc/arm/bzero.S

/*
 * Copyright (C) 2006 Apple Computer, Inc. All rights reserved.
 *
 * This document is the property of Apple Computer, Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Computer, Inc.
 */
/* 
 * A reasonably well-optimized bzero/memset. Should work equally well on arm11 and arm9 based
 * cores. 
 *
 * The algorithm is to align the destination pointer on a 32 byte boundary and then
 * blast data 64 bytes at a time, in two stores of 32 bytes per loop.
 */
	.text
	.align 2

	.globl _memset
/*
 * void *memset(void *ptr, int c, size_t len);
 *
 * In:  r0 = ptr, r1 = c, r2 = len
 * Out: branches to Lbzeroengine with r0 = ptr, r1 = len and
 *      r2 = the low byte of c replicated into all four byte lanes.
 *
 * r3 is only a temporary here; Lbzeroengine reloads it from r2 before
 * reading it.
 */
_memset:
	and		r3, r1, #0xff			/* r3 = 0x000000cc, keep only the low byte of c */
	orr		r3, r3, r3, lsl #8		/* r3 = 0x0000cccc */
	mov		r1, r2					/* the engine expects len in r1 */
	orr		r2, r3, r3, lsl #16		/* r2 = 0xcccccccc */
	b		Lbzeroengine

	.globl _bzero
/*
 * void bzero(void *ptr, size_t len);
 *
 * bzero entry point plus the fill engine it shares with memset. memset
 * branches to Lbzeroengine with the fill byte already replicated across
 * all four bytes of r2.
 *
 * Register contract at Lbzeroengine:
 *   r0  = destination pointer; never modified, so it is still the original
 *         pointer when we return
 *   r1  = number of bytes to fill, an unsigned size_t
 *   r2  = 32-bit fill pattern
 *   r3  = overwritten with a copy of r2
 *   r12 = running store pointer
 * r4-r8 and r10 are saved and restored around the 64 byte loop; r9 is not
 * used. The condition flags are clobbered.
 *
 * len is unsigned, so every branch that classifies it by size uses the
 * unsigned conditions (lo/hs). The signed forms (lt/ge) would send a len of
 * 0x80000000 or more down the bytewise path, which then under-fills.
 */
_bzero:
	/* zero out r2 so we can be just like memset(0) */
	mov		r2, #0

Lbzeroengine:
	/* move the base pointer into r12 and leave r0 alone so that we return the original pointer */
	mov		r12, r0

	/* copy r2 into r3 so that { r2-r3 } is an 8 byte store of the pattern */
	mov		r3, r2

	/* check for zero len */
	cmp		r1, #0
	bxeq	lr

	/* fall back to a bytewise store for less than 32 bytes (unsigned compare) */
	cmp		r1, #32
	blo		L_bytewise

	/* check for 32 byte unaligned ptr */
	tst		r12, #0x1f
	bne		L_unaligned

	/* make sure we have at least 64 bytes to fill (unsigned compare) */
	cmp		r1, #64
	blo		L_lessthan64aligned

	/* >= 64 bytes of len, 32 byte aligned */
L_64ormorealigned:

	/* we need some registers, avoid r9 */
	stmfd	sp!, { r4-r8, r10 }
	mov		r4, r2
	mov		r5, r2
	mov		r6, r2
	mov		r7, r2
	mov		r8, r2
	mov		r10, r2

	/* pre-subtract 64 from the len to avoid an extra compare in the loop */
	sub		r1, r1, #64

L_64loop:
	/* each stmia writes 8 registers = 32 bytes; stmia leaves the flags from subs intact */
	stmia	r12!, { r2-r8, r10 }
	subs	r1, r1, #64
	stmia	r12!, { r2-r8, r10 }
	/* carry set = no borrow, i.e. at least another 64 bytes remain */
	bhs		L_64loop

	/* restore the saved regs */
	ldmfd	sp!, { r4-r8, r10 }

	/* check for completion (had previously subtracted an extra 64 from len) */
	adds	r1, r1, #64
	bxeq	lr

L_lessthan64aligned:
	/*
	 * r1 is 1..63 here and r12 is 16 byte aligned.
	 * While 16 or more bytes are left, store 16 and loop. When the cmp
	 * fails (r1 < 16) the conditional instructions are skipped and the cmp
	 * flags make both branches below fall through to the tail.
	 */
	cmp		r1, #16
	stmiage	r12!, { r2-r3 }
	stmiage	r12!, { r2-r3 }
	subsge	r1, r1, #16
	bgt		L_lessthan64aligned
	bxeq	lr

L_lessthan16aligned:
	/* store 1 to 15 bytes */
	mov		r1, r1, lsl #28		/* move the remaining len bits [3:0] to the flags area of cpsr */
	msr		cpsr_f, r1			/* N = bit 3, Z = bit 2, C = bit 1, V = bit 0 of the remaining len */

	/* largest piece first so each store stays naturally aligned */
	stmiami	r12!, { r2-r3 }		/* n is set, store 8 bytes */
	streq	r2, [r12], #4		/* z is set, store 4 bytes */
	strhcs	r2, [r12], #2		/* c is set, store 2 bytes */
	strbvs	r2, [r12], #1		/* v is set, store 1 byte */
	bx		lr

L_bytewise:
	/*
	 * bytewise store, 2 bytes at a time, alignment not guaranteed.
	 * Only reached with 1 <= r1 < 32. The first strb is unconditional; the
	 * second only runs if at least 2 bytes were left (result of subs >= 0).
	 * hi = more than 2 bytes were left before the subs, so go around again.
	 */
	subs	r1, r1, #2
	strb	r2, [r12], #1
	strbpl	r2, [r12], #1
	bhi		L_bytewise
	bx		lr

L_unaligned:
	/*
	 * Not on a 32 byte boundary; r1 >= 32 here.
	 * Store 0-15 bytes until we're 16 byte aligned. r3 = (-ptr) & 15 held in
	 * bits [31:28], which is the number of bytes needed to reach the next
	 * 16 byte boundary; msr drops those four bits into N, Z, C, V.
	 */
	mov		r3, r12, lsl #28
	rsb		r3, r3, #0
	msr		cpsr_f, r3

	/* smallest piece first so each later store is naturally aligned */
	strbvs	r2, [r12], #1		/* v is set, unaligned in the 1s column */
	strhcs	r2, [r12], #2		/* c is set, unaligned in the 2s column */
	streq	r2, [r12], #4		/* z is set, unaligned in the 4s column */
	strmi	r2, [r12], #4		/* n is set, unaligned in the 8s column */
	strmi	r2, [r12], #4

	/* account for the 0-15 bytes just stored; r1 was >= 32 so it stays >= 17 and cannot reach zero */
	sub		r1, r1, r3, lsr #28

	/* we had previously trashed r3, restore it */
	mov		r3, r2

	/* now make sure we're 32 byte aligned; r1 >= 17, so after this r1 >= 1 */
	tst		r12, #(1 << 4)
	stmiane	r12!, { r2-r3 }
	stmiane	r12!, { r2-r3 }
	subne	r1, r1, #16

	/* we're now aligned, check for >= 64 bytes left (unsigned compare) */
	cmp		r1, #64
	bhs		L_64ormorealigned
	b		L_lessthan64aligned
first and last commit 2023-07-08 13:03:17 -07:00			`/*`
			`* Copyright (C) 2006 Apple Computer, Inc. All rights reserved.`
			`*`
			`* This document is the property of Apple Computer, Inc.`
			`* It is considered confidential and proprietary.`
			`*`
			`* This document may not be reproduced or transmitted in any form,`
			`* in whole or in part, without the express written permission of`
			`* Apple Computer, Inc.`
			`*/`
			`/*`
			`* A reasonably well-optimized bzero/memset. Should work equally well on arm11 and arm9 based`
			`* cores.`
			`*`
			`* The algorithm is to align the destination pointer on a 32 byte boundary and then`
			`* blast data 64 bytes at a time, in two stores of 32 bytes per loop.`
			`*/`
			`.text`
			`.align 2`

			`.globl _memset`
			`/* void *memset(void *ptr, int c, size_t len); */`
			`_memset:`
			`/* move len into r1, unpack c into r2 */`
			`mov r3, r2`
			`and r1, r1, #0xff`
			`orr r1, r1, r1, lsl #8`
			`orr r2, r1, r1, lsl #16`
			`mov r1, r3`
			`b Lbzeroengine`

			`.globl _bzero`
			`/* void bzero(void *ptr, size_t len); */`
			`_bzero:`
			`/* zero out r2 so we can be just like memset(0) */`
			`mov r2, #0`

			`Lbzeroengine:`
			`/* move the base pointer into r12 and leave r0 alone so that we return the original pointer */`
			`mov r12, r0`

			`/* copy r2 into r3 for 64-bit stores */`
			`mov r3, r2`

			`/* check for zero len */`
			`cmp r1, #0`
			`bxeq lr`

			`/* fall back to a bytewise store for less than 32 bytes */`
			`cmp r1, #32`
			`blt L_bytewise`

			`/* check for 32 byte unaligned ptr */`
			`tst r12, #0x1f`
			`bne L_unaligned`

			`/* make sure we have more than 64 bytes to zero */`
			`cmp r1, #64`
			`blt L_lessthan64aligned`

			`/* >= 64 bytes of len, 32 byte aligned */`
			`L_64ormorealigned:`

			`/* we need some registers, avoid r9 */`
			`stmfd sp!, { r4-r8, r10 }`
			`mov r4, r2`
			`mov r5, r2`
			`mov r6, r2`
			`mov r7, r2`
			`mov r8, r2`
			`mov r10, r2`

			`/* pre-subtract 64 from the len to avoid an extra compare in the loop */`
			`sub r1, r1, #64`

			`L_64loop:`
			`stmia r12!, { r2-r8, r10 }`
			`subs r1, r1, #64`
			`stmia r12!, { r2-r8, r10 }`
			`bge L_64loop`

			`/* restore the saved regs */`
			`ldmfd sp!, { r4-r8, r10 }`

			`/* check for completion (had previously subtracted an extra 64 from len) */`
			`adds r1, r1, #64`
			`bxeq lr`

			`L_lessthan64aligned:`
			`/* do we have 16 or more bytes left */`
			`cmp r1, #16`
			`stmiage r12!, { r2-r3 }`
			`stmiage r12!, { r2-r3 }`
			`subsge r1, r1, #16`
			`bgt L_lessthan64aligned`
			`bxeq lr`

			`L_lessthan16aligned:`
			`/* store 0 to 15 bytes */`
			`mov r1, r1, lsl #28 /* move the remaining len bits [3:0] to the flags area of cpsr */`
			`msr cpsr_f, r1`

			`stmiami r12!, { r2-r3 } /* n is set, store 8 bytes */`
			`streq r2, [r12], #4 /* z is set, store 4 bytes */`
			`strhcs r2, [r12], #2 /* c is set, store 2 bytes */`
			`strbvs r2, [r12], #1 /* v is set, store 1 byte */`
			`bx lr`

			`L_bytewise:`
			`/* bytewise copy, 2 bytes at a time, alignment not guaranteed */`
			`subs r1, r1, #2`
			`strb r2, [r12], #1`
			`strbpl r2, [r12], #1`
			`bhi L_bytewise`
			`bx lr`

			`L_unaligned:`
			`/* unaligned on 32 byte boundary, store 1-15 bytes until we're 16 byte aligned */`
			`mov r3, r12, lsl #28`
			`rsb r3, r3, #0x00000000`
			`msr cpsr_f, r3`

			`strbvs r2, [r12], #1 /* v is set, unaligned in the 1s column */`
			`strhcs r2, [r12], #2 /* c is set, unaligned in the 2s column */`
			`streq r2, [r12], #4 /* z is set, unaligned in the 4s column */`
			`strmi r2, [r12], #4 /* n is set, unaligned in the 8s column */`
			`strmi r2, [r12], #4`

			`subs r1, r1, r3, lsr #28`
			`bxeq lr`

			`/* we had previously trashed r3, restore it */`
			`mov r3, r2`

			`/* now make sure we're 32 byte aligned */`
			`tst r12, #(1 << 4)`
			`stmiane r12!, { r2-r3 }`
			`stmiane r12!, { r2-r3 }`
			`subsne r1, r1, #16`

			`/* we're now aligned, check for >= 64 bytes left */`
			`cmp r1, #64`
			`bge L_64ormorealigned`
			`b L_lessthan64aligned`