aarch64: vp9itxfm: Avoid reloading the idct32 coefficients

The idct32x32 function actually pushed d8-d15 onto the stack even
though it didn't clobber them; there are plenty of registers that
can be used to allow keeping all the idct coefficients in registers
without having to reload different subsets of them at different
stages in the transform.

After this, we can still skip pushing d14-d15.

Before:
vp9_inv_dct_dct_32x32_sub32_add_neon: 8128.3
After:
vp9_inv_dct_dct_32x32_sub32_add_neon: 8053.3

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2017-01-02 22:08:41 +02:00
parent 402546a172
commit 65aa002d54
1 changed file with 43 additions and 67 deletions

View File

@ -1123,18 +1123,14 @@ endfunc
.endm
function idct32_odd
ld1 {v0.8h,v1.8h}, [x11]
dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
ld1 {v0.8h}, [x10]
dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
@ -1153,18 +1149,14 @@ function idct32_odd
endfunc
function idct32_odd_half
ld1 {v0.8h,v1.8h}, [x11]
dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
ld1 {v0.8h}, [x10]
dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
@ -1183,18 +1175,14 @@ function idct32_odd_half
endfunc
function idct32_odd_quarter
ld1 {v0.8h,v1.8h}, [x11]
dsmull_h v4, v5, v16, v0.h[0]
dsmull_h v28, v29, v19, v0.h[7]
dsmull_h v30, v31, v16, v0.h[1]
dsmull_h v22, v23, v17, v1.h[6]
dsmull_h v7, v6, v17, v1.h[7]
dsmull_h v26, v27, v19, v0.h[6]
dsmull_h v20, v21, v18, v1.h[0]
dsmull_h v24, v25, v18, v1.h[1]
ld1 {v0.8h}, [x10]
dsmull_h v4, v5, v16, v8.h[0]
dsmull_h v28, v29, v19, v8.h[7]
dsmull_h v30, v31, v16, v8.h[1]
dsmull_h v22, v23, v17, v9.h[6]
dsmull_h v7, v6, v17, v9.h[7]
dsmull_h v26, v27, v19, v8.h[6]
dsmull_h v20, v21, v18, v9.h[0]
dsmull_h v24, v25, v18, v9.h[1]
neg v28.4s, v28.4s
neg v29.4s, v29.4s
@ -1240,12 +1228,8 @@ endfunc
// x1 = unused
// x2 = src
// x9 = double input stride
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1\suffix\()_neon
mov x14, x30
ld1 {v0.8h,v1.8h}, [x10]
movi v2.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@ -1278,14 +1262,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon
.macro store_rev a, b
// There's no rev128 instruction, but we reverse each 64 bit
// half, and then flip them using an ext with 8 bytes offset.
rev64 v1.8h, \b
rev64 v3.8h, \b
st1 {\a}, [x0], #16
rev64 v0.8h, \a
ext v1.16b, v1.16b, v1.16b, #8
rev64 v2.8h, \a
ext v3.16b, v3.16b, v3.16b, #8
st1 {\b}, [x0], #16
ext v0.16b, v0.16b, v0.16b, #8
st1 {v1.8h}, [x0], #16
st1 {v0.8h}, [x0], #16
ext v2.16b, v2.16b, v2.16b, #8
st1 {v3.8h}, [x0], #16
st1 {v2.8h}, [x0], #16
.endm
store_rev v16.8h, v24.8h
store_rev v17.8h, v25.8h
@ -1339,20 +1323,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon
// subtracted from the output.
.macro store_rev a, b
ld1 {v4.8h}, [x0]
rev64 v1.8h, \b
rev64 v3.8h, \b
add v4.8h, v4.8h, \a
rev64 v0.8h, \a
rev64 v2.8h, \a
st1 {v4.8h}, [x0], #16
ext v1.16b, v1.16b, v1.16b, #8
ext v3.16b, v3.16b, v3.16b, #8
ld1 {v5.8h}, [x0]
ext v0.16b, v0.16b, v0.16b, #8
ext v2.16b, v2.16b, v2.16b, #8
add v5.8h, v5.8h, \b
st1 {v5.8h}, [x0], #16
ld1 {v6.8h}, [x0]
sub v6.8h, v6.8h, v1.8h
sub v6.8h, v6.8h, v3.8h
st1 {v6.8h}, [x0], #16
ld1 {v7.8h}, [x0]
sub v7.8h, v7.8h, v0.8h
sub v7.8h, v7.8h, v2.8h
st1 {v7.8h}, [x0], #16
.endm
@ -1376,12 +1360,8 @@ endfunc
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass2\suffix\()_neon
mov x14, x30
ld1 {v0.8h,v1.8h}, [x10]
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -1454,15 +1434,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon
sub v6.8h, v6.8h, \c
sub v7.8h, v7.8h, \d
.endif
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v10.8b}, [x0], x1
ld1 {v11.8b}, [x0], x1
srshr v4.8h, v4.8h, #6
ld1 {v2.8b}, [x0], x1
srshr v5.8h, v5.8h, #6
uaddw v4.8h, v4.8h, v0.8b
uaddw v4.8h, v4.8h, v10.8b
ld1 {v3.8b}, [x0], x1
srshr v6.8h, v6.8h, #6
uaddw v5.8h, v5.8h, v1.8b
uaddw v5.8h, v5.8h, v11.8b
srshr v7.8h, v7.8h, #6
sub x0, x0, x1, lsl #2
uaddw v6.8h, v6.8h, v2.8b
@ -1503,13 +1483,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
b.eq idct32x32_dc_add_neon
movrel x10, idct_coeffs
add x11, x10, #32
movrel x12, min_eob_idct_idct_32, 2
mov x15, x30
stp d14, d15, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
@ -1523,6 +1500,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
mov x9, #128
neg x7, x9
ld1 {v0.8h,v1.8h}, [x10], #32
ld1 {v8.8h,v9.8h}, [x10]
cmp w3, #34
b.le idct32x32_quarter_add_neon
cmp w3, #135
@ -1565,8 +1545,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
br x15
endfunc
@ -1592,8 +1570,6 @@ function idct32x32_\size\()_add_neon
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
br x15
endfunc