/*
 * Copyright (C) 2013 Apple Inc. All rights reserved.
 *
 * This document is the property of Apple Computer, Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Inc.
 */

#if defined(__arm64__)

#define BASE 65521      /* largest prime smaller than 65536 */
#define NMAX 5552       /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

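// A quick check of the NMAX bound above (illustrative C sketch, not part of this build):
//
//      #include <assert.h>
//      #include <stdint.h>
//      int main(void) {
//          uint64_t n = 5552, base = 65521;
//          assert(255*n*(n+1)/2 + (n+1)*(base-1) <= 0xFFFFFFFFULL);   // 4294690200 fits in 32 bits
//          n = 5553;
//          assert(255*n*(n+1)/2 + (n+1)*(base-1) >  0xFFFFFFFFULL);   // 4296171735 would overflow
//          return 0;
//      }
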
// Note: buf should be 16-byte aligned by the caller.

// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
//     unsigned n;
//     while (len >= NMAX) {
//         len -= NMAX;
//         n = NMAX / 16;              /* NMAX is divisible by 16 */
//         do {
//             DO16(buf);              /* 16 sums unrolled */
//             buf += 16;
//         } while (--n);
//         MOD(adler);
//         MOD(sum2);
//     }
//     if (len) {                      /* avoid modulos if none remaining */
//         while (len >= 16) {
//             len -= 16;
//             DO16(buf);
//             buf += 16;
//         }
//         while (len--) {
//             adler += *buf++;
//             sum2 += adler;
//         }
//         MOD(adler);
//         MOD(sum2);
//     }
//     return adler | (sum2 << 16);    /* return recombined sums */
// }

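// The DO16() and MOD() used above are zlib's scalar byte-at-a-time macros.
// Roughly, following zlib's adler32.c (shown here only for reference; a sketch, not part of this file):
//
//     #define DO1(buf, i)  { adler += (buf)[i]; sum2 += adler; }
//     #define DO2(buf, i)  DO1(buf, i); DO1(buf, i + 1);
//     #define DO4(buf, i)  DO2(buf, i); DO2(buf, i + 2);
//     #define DO8(buf, i)  DO4(buf, i); DO4(buf, i + 4);
//     #define DO16(buf)    DO8(buf, 0); DO8(buf, 8);
//     #define MOD(a)       a %= BASE
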
/*
    DO16 vectorization:
    given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that
        sum2  += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
        adler += (x[0] + x[1] + ... + x[15]);

    similarly, 2 DO16 = DO32 vectorization:
    given initial unsigned int sum2 and adler, and a new set of 32 input bytes (x[0:31]), it can be shown that
        sum2  += (32*adler + 32*x[0] + 31*x[1] + ... + 1*x[31]);
        adler += (x[0] + x[1] + ... + x[31]);
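
    A small C check of the DO32 identity above (illustrative sketch, not part of this file;
    the scalar reference step is the zlib-style "adler += x[i]; sum2 += adler;"):

        #include <assert.h>
        #include <stdint.h>

        static void do32_identity_check(uint32_t adler, uint32_t sum2, const uint8_t x[32])
        {
            uint32_t a = adler, s = sum2;
            for (int i = 0; i < 32; i++) { a += x[i]; s += a; }     // 32 scalar steps

            uint32_t wsum = 0, bsum = 0;                            // weighted and plain byte sums
            for (int i = 0; i < 32; i++) {
                wsum += (uint32_t)(32 - i) * x[i];
                bsum += x[i];
            }
            assert(s == sum2 + 32*adler + wsum);                    // sum2 identity
            assert(a == adler + bsum);                              // adler identity
        }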

    NMAX = 5552
    NMAX/16 = 347 = 173*2 + 1

    therefore, for a block of 5552 bytes

        n = 173;
        do {
            DO32(buf); buf += 32;
        } while (--n);

        DO16(buf); buf += 16;
        MOD(adler);
        MOD(sum2);

    for the residual remaining len bytes,

        while (len >= 32) {
            DO32(buf); buf += 32; len -= 32;
        }
        if (len >= 16) {
            DO16(buf); buf += 16; len -= 16;
        }
        while (len--) {
            adler += *buf++;
            sum2 += adler;
        }
        MOD(adler);
        MOD(sum2);

    DO32:

    pack sum2:adler in a 64-bit register

    0. sum2 += adler*32;

        sum2:adler += (sum2:adler) << (32+5);

    1. sum2 += (32, 31, ..., 1) * (x0, x1, ..., x31), computed as (v2, v3) * (v0, v1):

        umull.8h   v4, v2, v0
        umull2.8h  v5, v2, v0
        umull.8h   v6, v3, v1
        umull2.8h  v7, v3, v1
        add.8h     v4, v4, v5
        add.8h     v6, v6, v7
        add.8h     v4, v4, v6
        uaddlv     s4, v4.8h

        add sum2, sum2, s4

    2. adler += (x0 + x1 + x2 + ... + x31)

        uaddlp  v0.8h, v0.16b
        uaddlp  v1.8h, v1.16b
        uaddlv  s0, v0.8h
        uaddlv  s1, v1.8h
        add     s0, s0, s1

        add adler, adler, s0

    therefore, this is what can be done to vectorize the above computation
    (the register names and mnemonics in this paragraph are armv7/NEON; the arm64 code
    below implements the same idea in the DO16/DO32 macros)

    1. 16-byte aligned vector load into q2 (x[0:15])
    2. sum2 += (adler<<32);     // pack adler/sum2 into a 64-bit word
    3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,...,1), (q9,q8) : 16 16-bit elements x[0:15]
    4. vmull.u8 (q11,q10),q2,q0 where q0 = (1,2,3,...,16), (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
    5. parallel add (with one expansion to 32-bit) (q9,q8) and (q11,q10) all the way to accumulate into adler and sum2

    In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop.

    1. 32-byte aligned vector load into q2,q14 (x[0:31])
    2. sum2 += (adler<<32);
    3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,...,1), (4 q registers) : 32 16-bit elements x[0:31]
    4. vmull.u8 (4 q registers),(q2,q14),(q0,q15) where q0 = (1,...,32), (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
    5. parallel add (with one expansion to 32-bit) the pair of (4 q regs) all the way to accumulate into adler and sum2

    This change improves the performance by ~0.55 cycles per uncompressed byte on an ARM Cortex-A8.

*/

/*
    MOD implementation:
    adler%BASE = adler - floor(adler*(1/BASE))*BASE;  where (1/BASE) = 0x80078071 in Q47

    (the steps below are written with armv7/NEON mnemonics; the arm64 MOD_BASE macro
    below uses the equivalent umull.2d/ushr.2d/ins/mls.2s sequence)

    1. vmull.u32 q2,(adler,sum2),(1/BASE)   // *(1/BASE) in Q47
    2. vshr.u64  q2,q2,#47                  // floor function
    3. vpadd.u32 d4,d4,d5                   // merge into a double word in d4
    4. vmls.u32  (adler,sum2),d4,d3[0]      // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
*/

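/*
    A standalone C sketch (illustrative, not part of this build) of the reciprocal
    reduction above: floor(n/BASE) is computed as (n * 0x80078071) >> 47, which is
    exact for every 32-bit n because 0x80078071 = ceil(2^47/65521) and
    0x80078071*65521 - 2^47 = 31073 <= 2^15.

        #include <assert.h>
        #include <stdint.h>

        static uint32_t mod_base(uint32_t n)
        {
            uint64_t q = ((uint64_t)n * 0x80078071ULL) >> 47;   // floor(n / 65521)
            return n - (uint32_t)(q * 65521u);                  // n mod 65521
        }

        // spot-check against the C '%' operator:
        //     for (uint64_t n = 0; n <= 0xFFFFFFFFULL; n += 65519)
        //         assert(mod_base((uint32_t)n) == (uint32_t)(n % 65521u));
*/
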
        .text
        .align  2
        .globl  _adler32_vec
_adler32_vec:

#define adler           w0
#define sum2            w1
#define buf             x2
#define len             w3
#define nmax            w4
#define nvec            w6      // number of DO32 loops per NMAX block
#define t               x7      // pointer to the constant table

#define vadlersum2      v5      // packed (adler, sum2) pair, held in the low 64 bits only
#define adlersum2       d5      // the same packed pair viewed as a 64-bit doubleword

        add     x0, x0, x1, lsl #32             // pack sum2 into the upper 32 bits of x0, adler in the lower 32
        adrp    t, vec_table@PAGE               // address of the coefficient/constant table
        mov     nmax, #NMAX                     // NMAX
        add     t, t, vec_table@PAGEOFF
        ins     vadlersum2.d[0], x0             // v5.d[0] = packed (adler, sum2)

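/*
    A C model of the packed-pair bookkeeping used throughout (illustrative only):

        uint64_t packed = ((uint64_t)sum2 << 32) | adler;   // sum2 high, adler low

        // one tail byte b:       adler += b; sum2 += adler;
        packed += b;
        packed += packed << 32;

        // per DO16 block:        sum2 += 16*adler  ->  packed += packed << (32+4)
        // per DO32 block:        sum2 += 32*adler  ->  packed += packed << (32+5)

    The shifted adds never spill into the wrong lane because adler stays far below
    2^27 between MOD reductions (at most BASE-1 + NMAX*255).
*/
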
#if KERNEL
        // in kernel mode, save the vector registers used in this function
        sub     x6, sp, #8*16
        sub     sp, sp, #8*16
        st1.4s  {v0,v1,v2,v3}, [x6], #4*16
        st1.4s  {v4,v5,v6,v7}, [x6], #4*16
#endif

        cmp     len, nmax                       // len vs NMAX
        ld1.4s  {v0, v1}, [t], #2*16            // load the coefficients (32,31,...,1) for the sum2 computation
        ldr     d7, [t]                         // BASE and 1/BASE (Q47), for the MOD operation
        b.lt    L_len_lessthan_NMAX             // if (len < NMAX) skip the while loop

        sub     len, len, nmax                  // pre-decrement len by NMAX

L_while_len_ge_NMAX_loop:                       // while (len>=NMAX) {

        // 5552/16 = 173*2 + 1, need 1 DO16 + 173 DO32

        .macro  DO16
        ld1.4s  {v2}, [buf]                     // 16 bytes input
        shl     d6, adlersum2, #(4+32)          // adler * 16, shifted up to the sum2 lane
        umull.8h  v4, v2, v1                    // 16*x0, 15*x1, ..., 9*x7
        add     adlersum2, adlersum2, d6        // sum2 += adler * 16
        umlal2.8h v4, v2, v1                    // accumulate 8*x8, 7*x9, ..., 1*x15
        uaddlv  h2, v2.16b                      // x0+x1+x2+...+x15
        uaddlv  s4, v4.8h                       // 16*x0 + 15*x1 + ... + 1*x15
        zip1.2s v4, v2, v4                      // lane 0 = byte sum (adler), lane 1 = weighted sum (sum2)
        add     buf, buf, #16                   // buf += 16
        add.2s  vadlersum2, vadlersum2, v4      // adler += byte sum; sum2 += weighted sum
        .endm

        .macro  DO32
        ld1.4s  {v2,v3}, [buf]                  // 32 bytes input
        shl     d6, adlersum2, #(5+32)          // adler * 32, shifted up to the sum2 lane
        umull.8h  v4, v2, v0                    // 32*x0, 31*x1, ..., 25*x7
        add     adlersum2, adlersum2, d6        // sum2 += adler * 32
        umlal2.8h v4, v2, v0                    // accumulate 24*x8, 23*x9, ..., 17*x15
        uaddlv  h2, v2.16b                      // x0+x1+x2+...+x15, extended from byte to short
        umlal.8h  v4, v3, v1                    // accumulate 16*x16, 15*x17, ..., 9*x23
        uaddlv  h6, v3.16b                      // x16+x17+x18+...+x31, extended from byte to short
        umlal2.8h v4, v3, v1                    // accumulate 8*x24, 7*x25, ..., 1*x31
        uaddl.4s  v2, v2, v6                    // x0+x1+...+x31, extended from short to int
        uaddlv  s4, v4.8h                       // 32*x0 + 31*x1 + ... + 1*x31
        zip1.2s v4, v2, v4                      // lane 0 = byte sum (adler), lane 1 = weighted sum (sum2)
        add     buf, buf, #32                   // buf += 32
        add.2s  vadlersum2, vadlersum2, v4      // adler += byte sum; sum2 += weighted sum
        .endm

        .macro  MOD_BASE
        umull.2d  v2, vadlersum2, v7[1]         // (adler, sum2) * (1/BASE) in Q47
        ushr.2d   v2, v2, #47                   // floor((adler, sum2) / BASE) as two doublewords
        ins       v2.s[1], v2.s[2]              // pack the two quotients into the low two words
        mls.2s    vadlersum2, v2, v7[0]         // (adler, sum2) -= floor((adler, sum2)/BASE) * BASE
        .endm

        DO16
        mov     nvec, #173                      // number of DO32 loops

0:
        DO32

        subs    nvec, nvec, #1
        b.gt    0b

        MOD_BASE                                // MOD(adler,sum2);

        subs    len, len, nmax                  // pre-decrement len by NMAX
        b.ge    L_while_len_ge_NMAX_loop        // repeat while len >= NMAX

        adds    len, len, nmax                  // post-increment len by NMAX

L_len_lessthan_NMAX:

        subs    len, len, #32                   // pre-decrement len by 32
        b.lt    L_len_lessthan_32               // if len < 32, skip the 32-byte loop

L_len32_loop:

        DO32

        subs    len, len, #32                   // len -= 32;
        b.ge    L_len32_loop

L_len_lessthan_32:

        tst     len, #16                        // if at least 16 bytes remain,
        b.eq    1f

        DO16                                    // process one final 16-byte block

1:
        ands    len, len, #15                   // 0..15 remaining bytes
        b.eq    L_len_is_zero

L_remaining_len_loop:
        ldrb    w0, [buf], #1                   // *buf++;
        sub     len, len, #1
        ins     v2.d[0], x0                     // move the byte into v2 (upper bits of x0 are zero)

        add     adlersum2, adlersum2, d2        // adler += *buf
        shl     d6, adlersum2, #32              // shift adler up to the sum2 position
        add     adlersum2, adlersum2, d6        // sum2 += adler
        cbnz    len, L_remaining_len_loop       // loop until len == 0

L_len_is_zero:

        MOD_BASE                                // mod(adler,BASE); mod(sum2,BASE);

        umov    w0, vadlersum2.s[0]             // adler
        umov    w1, vadlersum2.s[1]             // sum2
        add     w0, w0, w1, lsl #16             // return adler | (sum2 << 16);

#if KERNEL
        // restore the saved vector registers
        ld1.4s  {v0,v1,v2,v3}, [sp], #4*16
        ld1.4s  {v4,v5,v6,v7}, [sp], #4*16
#endif

        ret     lr

// constants to be loaded into q registers
        .align  4                               // 16-byte aligned

vec_table:

        // coefficients for computing sum2
        .long   0x1d1e1f20                      // v0.s[0] : bytes 32,31,30,29
        .long   0x191a1b1c                      // v0.s[1] : bytes 28,27,26,25
        .long   0x15161718                      // v0.s[2] : bytes 24,23,22,21
        .long   0x11121314                      // v0.s[3] : bytes 20,19,18,17
        .long   0x0d0e0f10                      // v1.s[0] : bytes 16,15,14,13
        .long   0x090a0b0c                      // v1.s[1] : bytes 12,11,10, 9
        .long   0x05060708                      // v1.s[2] : bytes  8, 7, 6, 5
        .long   0x01020304                      // v1.s[3] : bytes  4, 3, 2, 1

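        // The eight words above are just the byte coefficients 32,31,...,1 laid out in
        // little-endian order.  A hypothetical generator for the table (C sketch, not
        // part of this build):
        //
        //      #include <stdio.h>
        //      int main(void) {
        //          unsigned char c[32];
        //          for (int i = 0; i < 32; i++)
        //              c[i] = (unsigned char)(32 - i);         // v0 gets 32..17, v1 gets 16..1
        //          for (int i = 0; i < 32; i += 4)             // emit as little-endian .long words
        //              printf(".long 0x%02x%02x%02x%02x\n", c[i+3], c[i+2], c[i+1], c[i]);
        //          return 0;
        //      }
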
        .long   BASE                            // v7.s[0] : BASE
        .long   0x80078071                      // v7.s[1] : 1/BASE in Q47

#endif // __arm64__