iBoot/lib/cksum/arm64/adler32vec.S

/*
* Copyright (C) 2013 Apple Inc. All rights reserved.
*
* This document is the property of Apple Computer, Inc.
* It is considered confidential and proprietary.
*
* This document may not be reproduced or transmitted in any form,
* in whole or in part, without the express written permission of
* Apple Inc.
*/
#if defined(__arm64__)
#define BASE 65521 /* largest prime smaller than 65536 */
#define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
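/*
 A direct way to check the NMAX bound above (illustrative C, not part of the build):

     #include <assert.h>
     #include <stdint.h>

     int main(void) {
         const uint64_t base = 65521, limit = 0xffffffffULL;   // BASE, 2^32-1
         uint64_t n = 5552;                                    // NMAX satisfies the bound
         assert(255*n*(n+1)/2 + (n+1)*(base-1) <= limit);
         n = 5553;                                             // the next value does not
         assert(255*n*(n+1)/2 + (n+1)*(base-1) > limit);
         return 0;
     }
*/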
// Note: buf should have been 16-byte aligned in the caller function,
//
// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
//     unsigned n;
//     while (len >= NMAX) {
//         len -= NMAX;
//         n = NMAX / 16;           /* NMAX is divisible by 16 */
//         do {
//             DO16(buf);           /* 16 sums unrolled */
//             buf += 16;
//         } while (--n);
//         MOD(adler);
//         MOD(sum2);
//     }
//     if (len) {                   /* avoid modulos if none remaining */
//         while (len >= 16) {
//             len -= 16;
//             DO16(buf);
//             buf += 16;
//         }
//         while (len--) {
//             adler += *buf++;
//             sum2 += adler;
//         }
//         MOD(adler);
//         MOD(sum2);
//     }
//     return adler | (sum2 << 16); /* return recombined sums */
// }
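/*
 The DO16 and MOD macros referenced in the pseudo code above are not defined in
 this file; a scalar sketch of what they stand for (zlib-style, shown only for
 illustration, not the original macros):

     // add one byte into the running sums
     #define DO1(buf, i)  { adler += (buf)[i]; sum2 += adler; }
     #define DO2(buf, i)  { DO1(buf, i); DO1(buf, (i) + 1); }
     #define DO4(buf, i)  { DO2(buf, i); DO2(buf, (i) + 2); }
     #define DO8(buf, i)  { DO4(buf, i); DO4(buf, (i) + 4); }
     #define DO16(buf)    { DO8(buf, 0); DO8(buf, 8); }       // 16 sums unrolled
     #define MOD(a)       { (a) %= BASE; }                    // reduce mod BASE

 The vector code below produces the same adler/sum2 values 16 or 32 bytes at a time.
*/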
/*
 DO16 vectorization:
     given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that
         sum2  += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
         adler += (x[0] + x[1] + ... + x[15]);

 similarly, 2 DO16 = DO32 vectorization:
     given initial unsigned int sum2 and adler, and a new set of 32 input bytes (x[0:31]), it can be shown that
         sum2  += (32*adler + 32*x[0] + 31*x[1] + ... + 1*x[31]);
         adler += (x[0] + x[1] + ... + x[31]);

 NMAX = 5552
 NMAX/16 = 347 = 173*2 + 1

 therefore, for a block of 5552 bytes

     n = 173;
     do {
         DO32(buf); buf += 32;
     } while (--n);                 // 173 DO32 iterations
     DO16(buf); buf += 16;
     MOD(adler);
     MOD(sum2);

 for a residual remaining len bytes,

     while (len >= 32) {
         DO32(buf); buf += 32; len -= 32;
     }
     if (len >= 16) {
         DO16(buf); buf += 16; len -= 16;
     }
     while (len--) {
         adler += *buf++;
         sum2 += adler;
     }
     MOD(adler);
     MOD(sum2);
 DO32:
     pack sum2:adler in a 64-bit register

     0. sum2 += adler*32;
            sum2:adler += (sum2:adler) << (32+5);

     1. (32, 31, ..., 1) * (x0, x1, ..., x31)        (v2, v3) * (v0, v1)
            umull.8h  v4, v2, v0
            umull2.8h v5, v2, v0
            umull.8h  v6, v3, v1
            umull2.8h v7, v3, v1
            add.8h    v4, v4, v5
            add.8h    v6, v6, v7
            add.8h    v4, v4, v6
            uaddlv    s4, v4.8h
            add       sum2, sum2, s4

     2. adler += (x0 + x1 + x2 + ... + x31)
            uaddlp    v0.8h, v0.16b
            uaddlp    v1.8h, v1.16b
            uaddlv    s0, v0.8h
            uaddlv    s1, v1.8h
            add       s0, s0, s1
            add       adler, adler, s0
 therefore, this is what can be done to vectorize the above computation
     1. 16-byte aligned vector load into q2 (x[0:15])
     2. sum2 += (adler<<32);        // pack adler/sum2 into a 64-bit word
     3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,1...,1), (q9,q8) : 16 16-bit elements x[0:15]
     4. vmull.u8 (q11,q10),q2,q0 where q0 = (1,2,3,4...,16), (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
     5. parallel add (with a single widening to 32-bit) (q9,q8) and (q11,q10) all the way to accumulate into adler and sum2

 In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop.
     1. 32-byte aligned vector load into q2,q14 (x[0:31])
     2. sum2 += (adler<<32);
     3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,1...,1), (4 q registers) : 32 16-bit elements x[0:31]
     4. vmull.u8 (4 q registers),(q2,q14),(q0,q15) where q0 = (1,...,32), (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
     5. parallel add (with a single widening to 32-bit) the pair of (4 q regs) all the way to accumulate into adler and sum2

 This change improves performance by ~0.55 cycles per uncompressed byte on ARM Cortex-A8.
*/
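/*
 The DO32 identity above can be checked against the byte-at-a-time definition of
 adler32; a small self-contained C check (illustrative only, not part of the build):

     #include <assert.h>
     #include <stdint.h>

     int main(void) {
         uint8_t x[32];
         for (int i = 0; i < 32; i++) x[i] = (uint8_t)(37*i + 11);   // arbitrary bytes

         // byte-at-a-time reference
         uint64_t adler = 1, sum2 = 0;
         for (int i = 0; i < 32; i++) { adler += x[i]; sum2 += adler; }

         // DO32 form: one bulk update of sum2, one bulk update of adler
         uint64_t adler2 = 1, sum2b = 0, weighted = 0, total = 0;
         for (int i = 0; i < 32; i++) { weighted += (uint64_t)(32 - i)*x[i]; total += x[i]; }
         sum2b  += 32*adler2 + weighted;
         adler2 += total;

         assert(adler == adler2 && sum2 == sum2b);
         return 0;
     }
*/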
/*
MOD implementation:
adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47
1. vmull.u32 q2,(adler,sum2),(1/BASE) // *(1/BASE) in Q47
2. vshr.u64 q2,q2,#47 // floor function
3. vpadd.u32 d4,d4,d5 // merge into a double word in d4
4. vmls.u32 (adler,sum2),d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
*/
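/*
 The same reduction written in C (illustrative sketch; 0x80078071 is
 ceil(2^47/BASE), and the multiply-high-then-shift replaces the division):

     #include <assert.h>
     #include <stdint.h>

     #define BASE      65521u
     #define RECIP_Q47 0x80078071ull      // ceil(2^47 / BASE)

     // reduce a 32-bit accumulator mod BASE without a divide
     static uint32_t mod_base(uint32_t x)
     {
         uint32_t q = (uint32_t)((x * RECIP_Q47) >> 47);   // floor(x / BASE)
         return x - q * BASE;                              // x mod BASE
     }

     int main(void)
     {
         // spot-check against the plain % operator across the 32-bit range
         for (uint64_t x = 0; x <= 0xffffffffull; x += 65519)
             assert(mod_base((uint32_t)x) == (uint32_t)x % BASE);
         return 0;
     }
*/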
.text
.align 2
.globl _adler32_vec
_adler32_vec:
#define adler w0
#define sum2 w1
#define buf x2
#define len w3
#define nmax w4
#define nvec w6 // number of DO32 blocks per NMAX chunk (173)
#define t x7
#define vadlersum2 v5 // v5.s[0] = adler, v5.s[1] = sum2 (only the low 64 bits are used)
#define adlersum2 d5 // the same adler/sum2 pair viewed as a 64-bit scalar
add x0, x0, x1, lsl #32 // pack sum2:adler into x0 (adler in the low 32 bits)
adrp t, vec_table@PAGE // t = address of the coefficient/constant table
mov nmax, #NMAX // NMAX
add t, t, vec_table@PAGEOFF
ins vadlersum2.d[0], x0 // v5.d[0] = sum2:adler
#if KERNEL
sub x6, sp, #8*16 // x6 -> 128-byte save area below sp
sub sp, sp, #8*16 // reserve it on the stack
st1.4s {v0,v1,v2,v3},[x6],#4*16 // preserve v0-v7 when building for the kernel
st1.4s {v4,v5,v6,v7},[x6],#4*16
#endif
cmp len, nmax // len vs NMAX
ld1.4s {v0, v1}, [t], #2*16 // loading up coefficients for adler/sum2 computation
ldr d7, [t] // for MOD operation
b.lt L_len_lessthan_NMAX // if (len < NMAX) skip the while loop
sub len, len, nmax // pre-decrement len by NMAX
L_while_len_ge_NMAX_loop: // while (len>=NMAX) {
// 5552/16 = 347 = 173*2 + 1, so each NMAX block needs 1 DO16 + 173 DO32
.macro DO16
ld1.4s {v2},[buf] // 16 bytes input
shl d6, adlersum2, #(4+32) // adler * 16
umull.8h v4, v2, v1 // 16*x0, 15*x1, ..., 9*x7
add adlersum2, adlersum2, d6 // sum2 += adler * 16
umlal2.8h v4, v2, v1 // 8*x8, 7*x9, ..., 1*x15
uaddlv h2, v2.16b // x0+x1+x2+...+x15
uaddlv s4, v4.8h // 16*x0 + 15*x1 + ... + 1*x15
zip1.2s v4, v2, v4 // v4.s[0] = byte sum (adler delta), v4.s[1] = weighted sum (sum2 delta)
add buf, buf, #16
add.2s vadlersum2, vadlersum2, v4 // adler += byte sum; sum2 += weighted sum
.endm
.macro DO32
ld1.4s {v2,v3},[buf] // 32 bytes input
shl d6, adlersum2, #(5+32) // adler * 32
umull.8h v4, v2, v0 // 32*x0, 31*x1, ..., 25*x7
add adlersum2, adlersum2, d6 // sum2 += adler * 32
umlal2.8h v4, v2, v0 // accumulate 24*x8, 23*x9, ..., 17*x15
uaddlv h2, v2.16b // x0+x1+x2+...+x15 and extend from byte to short
umlal.8h v4, v3, v1 // accumulate 16*x16, 15*x17, ..., 9*x23
uaddlv h6, v3.16b // x16+x17+x18+...+x31 and extend from byte to short
umlal2.8h v4, v3, v1 // accumulate 8*x24, 7*x25, ..., 1*x31
uaddl.4s v2, v2, v6 // x0+x1+...+x31 and extend from short to int
uaddlv s4, v4.8h // 32*x0 + 31*x1 + ... + 1*x31
zip1.2s v4, v2, v4 // v4.s[0] = byte sum (adler delta), v4.s[1] = weighted sum (sum2 delta)
add buf, buf, #32
add.2s vadlersum2, vadlersum2, v4 // adler += byte sum; sum2 += weighted sum
.endm
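/*
 The DO32 macro above, rewritten with arm_neon.h intrinsics for readability
 (an illustrative C sketch of the same computation, not code used by this file;
 w_hi/w_lo correspond to the v0/v1 coefficient vectors loaded from vec_table):

     #include <arm_neon.h>
     #include <stdint.h>

     // consume 32 bytes and update adler/sum2 exactly as one DO32 step does
     static void do32(const uint8_t *buf, uint32_t *adler, uint32_t *sum2,
                      uint8x16_t w_hi,   // weights 32,31,...,17
                      uint8x16_t w_lo)   // weights 16,15,...,1
     {
         uint8x16_t x0 = vld1q_u8(buf);         // x[0..15]
         uint8x16_t x1 = vld1q_u8(buf + 16);    // x[16..31]

         *sum2 += 32 * *adler;                  // sum2 += adler*32

         // 32*x[0] + 31*x[1] + ... + 1*x[31], accumulated in 16-bit lanes
         uint16x8_t m = vmull_u8(vget_low_u8(x0), vget_low_u8(w_hi));
         m = vmlal_u8(m, vget_high_u8(x0), vget_high_u8(w_hi));
         m = vmlal_u8(m, vget_low_u8(x1),  vget_low_u8(w_lo));
         m = vmlal_u8(m, vget_high_u8(x1), vget_high_u8(w_lo));
         *sum2 += vaddlvq_u16(m);

         *adler += vaddlvq_u8(x0) + vaddlvq_u8(x1);   // adler += x[0]+...+x[31]
     }
*/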
.macro MOD_BASE
umull.2d v2,vadlersum2,v7[1] // (adler,sum2) * (2^47/BASE), widened to 64-bit lanes
ushr.2d v2, v2, #47 // >>47 : floor(adler/BASE), floor(sum2/BASE)
ins v2.s[1], v2.s[2] // pack the two quotients into v2.s[0], v2.s[1]
mls.2s vadlersum2,v2,v7[0] // (adler,sum2) -= quotient * BASE
.endm
DO16 // the single DO16 (5552 = 173*32 + 16)
mov nvec, #173 // number of DO32 loops
0:
DO32
subs nvec, nvec, #1
b.gt 0b
MOD_BASE // MOD(adler,sum2);
subs len, len, nmax // pre-decrement len by NMAX
b.ge L_while_len_ge_NMAX_loop // repeat while len >= NMAX
adds len, len, nmax // post-increment len by NMAX
L_len_lessthan_NMAX:
subs len, len, #32 // pre-decrement len by 32
b.lt L_len_lessthan_32 // if len < 32, skip the 32-byte loop
L_len32_loop:
DO32
subs len, len, #32 // len -= 32;
b.ge L_len32_loop
L_len_lessthan_32:
tst len, #16 // at least 16 bytes left?
b.eq 1f
DO16 // if so, one more DO16
1:
ands len, len, #15 // len = remaining bytes (0..15)
b.eq L_len_is_zero // nothing left -> finish up
L_remaining_len_loop:
ldrb w0, [buf], #1 // *buf++;
sub len, len, #1
ins v2.d[0], x0 // d2 = the new byte (zero-extended by ldrb)
add adlersum2, adlersum2, d2 // adler += *buf
shl d6, adlersum2, #32 // shift adler up to sum2 position
add adlersum2, adlersum2, d6 // sum2 += adler
cbnz len, L_remaining_len_loop // repeat until len == 0
L_len_is_zero:
MOD_BASE // mod(adler,BASE); mod(sum2,BASE);
umov w0, vadlersum2.s[0] // adler
umov w1, vadlersum2.s[1] // sum2
add w0, w0, w1, lsl #16 // to return adler | (sum2 << 16);
#if KERNEL
ld1.4s {v0,v1,v2,v3},[sp],#4*16 // restore v0-v7
ld1.4s {v4,v5,v6,v7},[sp],#4*16 // and release the 128-byte save area
#endif
ret lr
// constants loaded into v0/v1 (sum2 coefficients) and v7 (BASE, 1/BASE)
.align 4 // 16 byte aligned
vec_table:
// coefficients for computing sum2: bytes 32,31,...,1 in little-endian order
.long 0x1d1e1f20 // v0.s[0] : 32,31,30,29
.long 0x191a1b1c // v0.s[1] : 28,27,26,25
.long 0x15161718 // v0.s[2] : 24,23,22,21
.long 0x11121314 // v0.s[3] : 20,19,18,17
.long 0x0d0e0f10 // v1.s[0] : 16,15,14,13
.long 0x090a0b0c // v1.s[1] : 12,11,10,9
.long 0x05060708 // v1.s[2] : 8,7,6,5
.long 0x01020304 // v1.s[3] : 4,3,2,1
.long BASE // v7.s[0] : BASE
.long 0x80078071 // v7.s[1] : 1/BASE in Q47
#endif // __arm64__
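/*
 A sketch of how a caller would use this routine (hypothetical wrapper, written
 to honor the 16-byte alignment note at the top of this file and the
 adler | (sum2 << 16) return convention; names here are illustrative):

     #include <stddef.h>
     #include <stdint.h>

     #define ADLER_BASE 65521u

     // the routine implemented above: adler and sum2 are passed already split,
     // and come back recombined as adler | (sum2 << 16)
     extern uint32_t adler32_vec(uint32_t adler, uint32_t sum2,
                                 const uint8_t *buf, int len);

     static uint32_t my_adler32(uint32_t adler, const uint8_t *buf, size_t len)
     {
         uint32_t sum2 = adler >> 16;
         adler &= 0xffff;

         // handle leading bytes scalar-wise until buf is 16-byte aligned
         while (len && ((uintptr_t)buf & 15)) {
             adler = (adler + *buf++) % ADLER_BASE;
             sum2  = (sum2 + adler)  % ADLER_BASE;
             len--;
         }
         if (len)
             return adler32_vec(adler, sum2, buf, (int)len);
         return adler | (sum2 << 16);
     }
*/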