aarch64: Consistently use lowercase for vector element specifiers

Signed-off-by: Martin Storsjö <martin@martin.st>
Martin Storsjö 2023-10-17 14:16:24 +03:00
parent 393d1ee541
commit 184103b310
16 changed files with 2199 additions and 2199 deletions


@@ -19,82 +19,82 @@
#include "libavutil/aarch64/asm.S"
function ff_ps_add_squares_neon, export=1
1: ld1 {v0.4S,v1.4S}, [x1], #32
fmul v0.4S, v0.4S, v0.4S
fmul v1.4S, v1.4S, v1.4S
faddp v2.4S, v0.4S, v1.4S
ld1 {v3.4S}, [x0]
fadd v3.4S, v3.4S, v2.4S
st1 {v3.4S}, [x0], #16
1: ld1 {v0.4s,v1.4s}, [x1], #32
fmul v0.4s, v0.4s, v0.4s
fmul v1.4s, v1.4s, v1.4s
faddp v2.4s, v0.4s, v1.4s
ld1 {v3.4s}, [x0]
fadd v3.4s, v3.4s, v2.4s
st1 {v3.4s}, [x0], #16
subs w2, w2, #4
b.gt 1b
ret
endfunc
function ff_ps_mul_pair_single_neon, export=1
1: ld1 {v0.4S,v1.4S}, [x1], #32
ld1 {v2.4S}, [x2], #16
zip1 v3.4S, v2.4S, v2.4S
zip2 v4.4S, v2.4S, v2.4S
fmul v0.4S, v0.4S, v3.4S
fmul v1.4S, v1.4S, v4.4S
st1 {v0.4S,v1.4S}, [x0], #32
1: ld1 {v0.4s,v1.4s}, [x1], #32
ld1 {v2.4s}, [x2], #16
zip1 v3.4s, v2.4s, v2.4s
zip2 v4.4s, v2.4s, v2.4s
fmul v0.4s, v0.4s, v3.4s
fmul v1.4s, v1.4s, v4.4s
st1 {v0.4s,v1.4s}, [x0], #32
subs w3, w3, #4
b.gt 1b
ret
endfunc
function ff_ps_stereo_interpolate_neon, export=1
ld1 {v0.4S}, [x2]
ld1 {v1.4S}, [x3]
zip1 v4.4S, v0.4S, v0.4S
zip2 v5.4S, v0.4S, v0.4S
zip1 v6.4S, v1.4S, v1.4S
zip2 v7.4S, v1.4S, v1.4S
1: ld1 {v2.2S}, [x0]
ld1 {v3.2S}, [x1]
fadd v4.4S, v4.4S, v6.4S
fadd v5.4S, v5.4S, v7.4S
mov v2.D[1], v2.D[0]
mov v3.D[1], v3.D[0]
fmul v2.4S, v2.4S, v4.4S
fmla v2.4S, v3.4S, v5.4S
st1 {v2.D}[0], [x0], #8
st1 {v2.D}[1], [x1], #8
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x3]
zip1 v4.4s, v0.4s, v0.4s
zip2 v5.4s, v0.4s, v0.4s
zip1 v6.4s, v1.4s, v1.4s
zip2 v7.4s, v1.4s, v1.4s
1: ld1 {v2.2s}, [x0]
ld1 {v3.2s}, [x1]
fadd v4.4s, v4.4s, v6.4s
fadd v5.4s, v5.4s, v7.4s
mov v2.d[1], v2.d[0]
mov v3.d[1], v3.d[0]
fmul v2.4s, v2.4s, v4.4s
fmla v2.4s, v3.4s, v5.4s
st1 {v2.d}[0], [x0], #8
st1 {v2.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ret
endfunc
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
ld1 {v0.4S,v1.4S}, [x2]
ld1 {v6.4S,v7.4S}, [x3]
fneg v2.4S, v1.4S
fneg v3.4S, v7.4S
zip1 v16.4S, v0.4S, v0.4S
zip2 v17.4S, v0.4S, v0.4S
zip1 v18.4S, v2.4S, v1.4S
zip2 v19.4S, v2.4S, v1.4S
zip1 v20.4S, v6.4S, v6.4S
zip2 v21.4S, v6.4S, v6.4S
zip1 v22.4S, v3.4S, v7.4S
zip2 v23.4S, v3.4S, v7.4S
1: ld1 {v2.2S}, [x0]
ld1 {v3.2S}, [x1]
fadd v16.4S, v16.4S, v20.4S
fadd v17.4S, v17.4S, v21.4S
mov v2.D[1], v2.D[0]
mov v3.D[1], v3.D[0]
fmul v4.4S, v2.4S, v16.4S
fmla v4.4S, v3.4S, v17.4S
fadd v18.4S, v18.4S, v22.4S
fadd v19.4S, v19.4S, v23.4S
ext v2.16B, v2.16B, v2.16B, #4
ext v3.16B, v3.16B, v3.16B, #4
fmla v4.4S, v2.4S, v18.4S
fmla v4.4S, v3.4S, v19.4S
st1 {v4.D}[0], [x0], #8
st1 {v4.D}[1], [x1], #8
ld1 {v0.4s,v1.4s}, [x2]
ld1 {v6.4s,v7.4s}, [x3]
fneg v2.4s, v1.4s
fneg v3.4s, v7.4s
zip1 v16.4s, v0.4s, v0.4s
zip2 v17.4s, v0.4s, v0.4s
zip1 v18.4s, v2.4s, v1.4s
zip2 v19.4s, v2.4s, v1.4s
zip1 v20.4s, v6.4s, v6.4s
zip2 v21.4s, v6.4s, v6.4s
zip1 v22.4s, v3.4s, v7.4s
zip2 v23.4s, v3.4s, v7.4s
1: ld1 {v2.2s}, [x0]
ld1 {v3.2s}, [x1]
fadd v16.4s, v16.4s, v20.4s
fadd v17.4s, v17.4s, v21.4s
mov v2.d[1], v2.d[0]
mov v3.d[1], v3.d[0]
fmul v4.4s, v2.4s, v16.4s
fmla v4.4s, v3.4s, v17.4s
fadd v18.4s, v18.4s, v22.4s
fadd v19.4s, v19.4s, v23.4s
ext v2.16b, v2.16b, v2.16b, #4
ext v3.16b, v3.16b, v3.16b, #4
fmla v4.4s, v2.4s, v18.4s
fmla v4.4s, v3.4s, v19.4s
st1 {v4.d}[0], [x0], #8
st1 {v4.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ret
@@ -102,46 +102,46 @@ endfunc
function ff_ps_hybrid_analysis_neon, export=1
lsl x3, x3, #3
ld2 {v0.4S,v1.4S}, [x1], #32
ld2 {v2.2S,v3.2S}, [x1], #16
ld1 {v24.2S}, [x1], #8
ld2 {v4.2S,v5.2S}, [x1], #16
ld2 {v6.4S,v7.4S}, [x1]
rev64 v6.4S, v6.4S
rev64 v7.4S, v7.4S
ext v6.16B, v6.16B, v6.16B, #8
ext v7.16B, v7.16B, v7.16B, #8
rev64 v4.2S, v4.2S
rev64 v5.2S, v5.2S
mov v2.D[1], v3.D[0]
mov v4.D[1], v5.D[0]
mov v5.D[1], v2.D[0]
mov v3.D[1], v4.D[0]
fadd v16.4S, v0.4S, v6.4S
fadd v17.4S, v1.4S, v7.4S
fsub v18.4S, v1.4S, v7.4S
fsub v19.4S, v0.4S, v6.4S
fadd v22.4S, v2.4S, v4.4S
fsub v23.4S, v5.4S, v3.4S
trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5}
trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7}
1: ld2 {v2.4S,v3.4S}, [x2], #32
ld2 {v4.2S,v5.2S}, [x2], #16
ld1 {v6.2S}, [x2], #8
ld2 {v0.4s,v1.4s}, [x1], #32
ld2 {v2.2s,v3.2s}, [x1], #16
ld1 {v24.2s}, [x1], #8
ld2 {v4.2s,v5.2s}, [x1], #16
ld2 {v6.4s,v7.4s}, [x1]
rev64 v6.4s, v6.4s
rev64 v7.4s, v7.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #8
rev64 v4.2s, v4.2s
rev64 v5.2s, v5.2s
mov v2.d[1], v3.d[0]
mov v4.d[1], v5.d[0]
mov v5.d[1], v2.d[0]
mov v3.d[1], v4.d[0]
fadd v16.4s, v0.4s, v6.4s
fadd v17.4s, v1.4s, v7.4s
fsub v18.4s, v1.4s, v7.4s
fsub v19.4s, v0.4s, v6.4s
fadd v22.4s, v2.4s, v4.4s
fsub v23.4s, v5.4s, v3.4s
trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
1: ld2 {v2.4s,v3.4s}, [x2], #32
ld2 {v4.2s,v5.2s}, [x2], #16
ld1 {v6.2s}, [x2], #8
add x2, x2, #8
mov v4.D[1], v5.D[0]
mov v6.S[1], v6.S[0]
fmul v6.2S, v6.2S, v24.2S
fmul v0.4S, v2.4S, v16.4S
fmul v1.4S, v2.4S, v17.4S
fmls v0.4S, v3.4S, v18.4S
fmla v1.4S, v3.4S, v19.4S
fmla v0.4S, v4.4S, v20.4S
fmla v1.4S, v4.4S, v21.4S
faddp v0.4S, v0.4S, v1.4S
faddp v0.4S, v0.4S, v0.4S
fadd v0.2S, v0.2S, v6.2S
st1 {v0.2S}, [x0], x3
mov v4.d[1], v5.d[0]
mov v6.s[1], v6.s[0]
fmul v6.2s, v6.2s, v24.2s
fmul v0.4s, v2.4s, v16.4s
fmul v1.4s, v2.4s, v17.4s
fmls v0.4s, v3.4s, v18.4s
fmla v1.4s, v3.4s, v19.4s
fmla v0.4s, v4.4s, v20.4s
fmla v1.4s, v4.4s, v21.4s
faddp v0.4s, v0.4s, v1.4s
faddp v0.4s, v0.4s, v0.4s
fadd v0.2s, v0.2s, v6.2s
st1 {v0.2s}, [x0], x3
subs w4, w4, #1
b.gt 1b
ret


@@ -39,10 +39,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
lsl w10, w10, #1
add w9, w9, w10
add x6, x6, w9, UXTW
ld1r {v22.8H}, [x6]
ld1r {v22.8h}, [x6]
.endif
.ifc \codec,vc1
movi v22.8H, #28
movi v22.8h, #28
.endif
mul w7, w4, w5
lsl w14, w5, #3
@@ -55,139 +55,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
add w4, w4, #64
b.eq 2f
dup v0.8B, w4
dup v1.8B, w12
ld1 {v4.8B, v5.8B}, [x1], x2
dup v2.8B, w6
dup v3.8B, w7
ext v5.8B, v4.8B, v5.8B, #1
1: ld1 {v6.8B, v7.8B}, [x1], x2
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v5.8B, v1.8B
ext v7.8B, v6.8B, v7.8B, #1
ld1 {v4.8B, v5.8B}, [x1], x2
umlal v16.8H, v6.8B, v2.8B
dup v0.8b, w4
dup v1.8b, w12
ld1 {v4.8b, v5.8b}, [x1], x2
dup v2.8b, w6
dup v3.8b, w7
ext v5.8b, v4.8b, v5.8b, #1
1: ld1 {v6.8b, v7.8b}, [x1], x2
umull v16.8h, v4.8b, v0.8b
umlal v16.8h, v5.8b, v1.8b
ext v7.8b, v6.8b, v7.8b, #1
ld1 {v4.8b, v5.8b}, [x1], x2
umlal v16.8h, v6.8b, v2.8b
prfm pldl1strm, [x1]
ext v5.8B, v4.8B, v5.8B, #1
umlal v16.8H, v7.8B, v3.8B
umull v17.8H, v6.8B, v0.8B
ext v5.8b, v4.8b, v5.8b, #1
umlal v16.8h, v7.8b, v3.8b
umull v17.8h, v6.8b, v0.8b
subs w3, w3, #2
umlal v17.8H, v7.8B, v1.8B
umlal v17.8H, v4.8B, v2.8B
umlal v17.8H, v5.8B, v3.8B
umlal v17.8h, v7.8b, v1.8b
umlal v17.8h, v4.8b, v2.8b
umlal v17.8h, v5.8b, v3.8b
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
rshrn v16.8b, v16.8h, #6
rshrn v17.8b, v17.8h, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
add v16.8h, v16.8h, v22.8h
add v17.8h, v17.8h, v22.8h
shrn v16.8b, v16.8h, #6
shrn v17.8b, v17.8h, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
ld1 {v20.8b}, [x8], x2
ld1 {v21.8b}, [x8], x2
urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8b, v17.8b, v21.8b
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
b.gt 1b
ret
2: adds w12, w12, w6
dup v0.8B, w4
dup v0.8b, w4
b.eq 5f
tst w6, w6
dup v1.8B, w12
dup v1.8b, w12
b.eq 4f
ld1 {v4.8B}, [x1], x2
3: ld1 {v6.8B}, [x1], x2
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v6.8B, v1.8B
ld1 {v4.8B}, [x1], x2
umull v17.8H, v6.8B, v0.8B
umlal v17.8H, v4.8B, v1.8B
ld1 {v4.8b}, [x1], x2
3: ld1 {v6.8b}, [x1], x2
umull v16.8h, v4.8b, v0.8b
umlal v16.8h, v6.8b, v1.8b
ld1 {v4.8b}, [x1], x2
umull v17.8h, v6.8b, v0.8b
umlal v17.8h, v4.8b, v1.8b
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
rshrn v16.8b, v16.8h, #6
rshrn v17.8b, v17.8h, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
add v16.8h, v16.8h, v22.8h
add v17.8h, v17.8h, v22.8h
shrn v16.8b, v16.8h, #6
shrn v17.8b, v17.8h, #6
.endif
prfm pldl1strm, [x1, x2]
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
ld1 {v20.8b}, [x8], x2
ld1 {v21.8b}, [x8], x2
urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8b, v17.8b, v21.8b
.endif
subs w3, w3, #2
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
b.gt 3b
ret
4: ld1 {v4.8B, v5.8B}, [x1], x2
ld1 {v6.8B, v7.8B}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1
ext v7.8B, v6.8B, v7.8B, #1
4: ld1 {v4.8b, v5.8b}, [x1], x2
ld1 {v6.8b, v7.8b}, [x1], x2
ext v5.8b, v4.8b, v5.8b, #1
ext v7.8b, v6.8b, v7.8b, #1
prfm pldl1strm, [x1]
subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v5.8B, v1.8B
umull v17.8H, v6.8B, v0.8B
umlal v17.8H, v7.8B, v1.8B
umull v16.8h, v4.8b, v0.8b
umlal v16.8h, v5.8b, v1.8b
umull v17.8h, v6.8b, v0.8b
umlal v17.8h, v7.8b, v1.8b
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
rshrn v16.8b, v16.8h, #6
rshrn v17.8b, v17.8h, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
add v16.8h, v16.8h, v22.8h
add v17.8h, v17.8h, v22.8h
shrn v16.8b, v16.8h, #6
shrn v17.8b, v17.8h, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
ld1 {v20.8b}, [x8], x2
ld1 {v21.8b}, [x8], x2
urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8b, v17.8b, v21.8b
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
b.gt 4b
ret
5: ld1 {v4.8B}, [x1], x2
ld1 {v5.8B}, [x1], x2
5: ld1 {v4.8b}, [x1], x2
ld1 {v5.8b}, [x1], x2
prfm pldl1strm, [x1]
subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B
umull v17.8H, v5.8B, v0.8B
umull v16.8h, v4.8b, v0.8b
umull v17.8h, v5.8b, v0.8b
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
rshrn v16.8b, v16.8h, #6
rshrn v17.8b, v17.8h, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
add v16.8h, v16.8h, v22.8h
add v17.8h, v17.8h, v22.8h
shrn v16.8b, v16.8h, #6
shrn v17.8b, v17.8h, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
ld1 {v20.8b}, [x8], x2
ld1 {v21.8b}, [x8], x2
urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8b, v17.8b, v21.8b
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
b.gt 5b
ret
endfunc
@@ -209,10 +209,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
lsl w10, w10, #1
add w9, w9, w10
add x6, x6, w9, UXTW
ld1r {v22.8H}, [x6]
ld1r {v22.8h}, [x6]
.endif
.ifc \codec,vc1
movi v22.8H, #28
movi v22.8h, #28
.endif
mul w7, w4, w5
lsl w14, w5, #3
@@ -225,133 +225,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
add w4, w4, #64
b.eq 2f
dup v24.8B, w4
dup v25.8B, w12
ld1 {v4.8B}, [x1], x2
dup v26.8B, w6
dup v27.8B, w7
ext v5.8B, v4.8B, v5.8B, #1
trn1 v0.2S, v24.2S, v25.2S
trn1 v2.2S, v26.2S, v27.2S
trn1 v4.2S, v4.2S, v5.2S
1: ld1 {v6.8B}, [x1], x2
ext v7.8B, v6.8B, v7.8B, #1
trn1 v6.2S, v6.2S, v7.2S
umull v18.8H, v4.8B, v0.8B
umlal v18.8H, v6.8B, v2.8B
ld1 {v4.8B}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1
trn1 v4.2S, v4.2S, v5.2S
dup v24.8b, w4
dup v25.8b, w12
ld1 {v4.8b}, [x1], x2
dup v26.8b, w6
dup v27.8b, w7
ext v5.8b, v4.8b, v5.8b, #1
trn1 v0.2s, v24.2s, v25.2s
trn1 v2.2s, v26.2s, v27.2s
trn1 v4.2s, v4.2s, v5.2s
1: ld1 {v6.8b}, [x1], x2
ext v7.8b, v6.8b, v7.8b, #1
trn1 v6.2s, v6.2s, v7.2s
umull v18.8h, v4.8b, v0.8b
umlal v18.8h, v6.8b, v2.8b
ld1 {v4.8b}, [x1], x2
ext v5.8b, v4.8b, v5.8b, #1
trn1 v4.2s, v4.2s, v5.2s
prfm pldl1strm, [x1]
umull v19.8H, v6.8B, v0.8B
umlal v19.8H, v4.8B, v2.8B
trn1 v30.2D, v18.2D, v19.2D
trn2 v31.2D, v18.2D, v19.2D
add v18.8H, v30.8H, v31.8H
umull v19.8h, v6.8b, v0.8b
umlal v19.8h, v4.8b, v2.8b
trn1 v30.2d, v18.2d, v19.2d
trn2 v31.2d, v18.2d, v19.2d
add v18.8h, v30.8h, v31.8h
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
rshrn v16.8b, v18.8h, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
add v18.8h, v18.8h, v22.8h
shrn v16.8b, v18.8h, #6
.endif
subs w3, w3, #2
prfm pldl1strm, [x1, x2]
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
ld1 {v20.s}[0], [x8], x2
ld1 {v20.s}[1], [x8], x2
urhadd v16.8b, v16.8b, v20.8b
.endif
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
st1 {v16.s}[0], [x0], x2
st1 {v16.s}[1], [x0], x2
b.gt 1b
ret
2: adds w12, w12, w6
dup v30.8B, w4
dup v30.8b, w4
b.eq 5f
tst w6, w6
dup v31.8B, w12
trn1 v0.2S, v30.2S, v31.2S
trn2 v1.2S, v30.2S, v31.2S
dup v31.8b, w12
trn1 v0.2s, v30.2s, v31.2s
trn2 v1.2s, v30.2s, v31.2s
b.eq 4f
ext v1.8B, v0.8B, v1.8B, #4
ld1 {v4.S}[0], [x1], x2
3: ld1 {v4.S}[1], [x1], x2
umull v18.8H, v4.8B, v0.8B
ld1 {v4.S}[0], [x1], x2
umull v19.8H, v4.8B, v1.8B
trn1 v30.2D, v18.2D, v19.2D
trn2 v31.2D, v18.2D, v19.2D
add v18.8H, v30.8H, v31.8H
ext v1.8b, v0.8b, v1.8b, #4
ld1 {v4.s}[0], [x1], x2
3: ld1 {v4.s}[1], [x1], x2
umull v18.8h, v4.8b, v0.8b
ld1 {v4.s}[0], [x1], x2
umull v19.8h, v4.8b, v1.8b
trn1 v30.2d, v18.2d, v19.2d
trn2 v31.2d, v18.2d, v19.2d
add v18.8h, v30.8h, v31.8h
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
rshrn v16.8b, v18.8h, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
add v18.8h, v18.8h, v22.8h
shrn v16.8b, v18.8h, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
ld1 {v20.s}[0], [x8], x2
ld1 {v20.s}[1], [x8], x2
urhadd v16.8b, v16.8b, v20.8b
.endif
subs w3, w3, #2
prfm pldl1strm, [x1, x2]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
st1 {v16.s}[0], [x0], x2
st1 {v16.s}[1], [x0], x2
b.gt 3b
ret
4: ld1 {v4.8B}, [x1], x2
ld1 {v6.8B}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1
ext v7.8B, v6.8B, v7.8B, #1
trn1 v4.2S, v4.2S, v5.2S
trn1 v6.2S, v6.2S, v7.2S
umull v18.8H, v4.8B, v0.8B
umull v19.8H, v6.8B, v0.8B
4: ld1 {v4.8b}, [x1], x2
ld1 {v6.8b}, [x1], x2
ext v5.8b, v4.8b, v5.8b, #1
ext v7.8b, v6.8b, v7.8b, #1
trn1 v4.2s, v4.2s, v5.2s
trn1 v6.2s, v6.2s, v7.2s
umull v18.8h, v4.8b, v0.8b
umull v19.8h, v6.8b, v0.8b
subs w3, w3, #2
trn1 v30.2D, v18.2D, v19.2D
trn2 v31.2D, v18.2D, v19.2D
add v18.8H, v30.8H, v31.8H
trn1 v30.2d, v18.2d, v19.2d
trn2 v31.2d, v18.2d, v19.2d
add v18.8h, v30.8h, v31.8h
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
rshrn v16.8b, v18.8h, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
add v18.8h, v18.8h, v22.8h
shrn v16.8b, v18.8h, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
ld1 {v20.s}[0], [x8], x2
ld1 {v20.s}[1], [x8], x2
urhadd v16.8b, v16.8b, v20.8b
.endif
prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
st1 {v16.s}[0], [x0], x2
st1 {v16.s}[1], [x0], x2
b.gt 4b
ret
5: ld1 {v4.S}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2
umull v18.8H, v4.8B, v30.8B
5: ld1 {v4.s}[0], [x1], x2
ld1 {v4.s}[1], [x1], x2
umull v18.8h, v4.8b, v30.8b
subs w3, w3, #2
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
rshrn v16.8b, v18.8h, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
add v18.8h, v18.8h, v22.8h
shrn v16.8b, v18.8h, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
ld1 {v20.s}[0], [x8], x2
ld1 {v20.s}[1], [x8], x2
urhadd v16.8b, v16.8b, v20.8b
.endif
prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
st1 {v16.s}[0], [x0], x2
st1 {v16.s}[1], [x0], x2
b.gt 5b
ret
endfunc
@@ -372,51 +372,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
sub w4, w7, w13
sub w4, w4, w14
add w4, w4, #64
dup v0.8B, w4
dup v2.8B, w12
dup v1.8B, w6
dup v3.8B, w7
trn1 v0.4H, v0.4H, v2.4H
trn1 v1.4H, v1.4H, v3.4H
dup v0.8b, w4
dup v2.8b, w12
dup v1.8b, w6
dup v3.8b, w7
trn1 v0.4h, v0.4h, v2.4h
trn1 v1.4h, v1.4h, v3.4h
1:
ld1 {v4.S}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2
rev64 v5.2S, v4.2S
ld1 {v5.S}[1], [x1]
ext v6.8B, v4.8B, v5.8B, #1
ext v7.8B, v5.8B, v4.8B, #1
trn1 v4.4H, v4.4H, v6.4H
trn1 v5.4H, v5.4H, v7.4H
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v5.8B, v1.8B
ld1 {v4.s}[0], [x1], x2
ld1 {v4.s}[1], [x1], x2
rev64 v5.2s, v4.2s
ld1 {v5.s}[1], [x1]
ext v6.8b, v4.8b, v5.8b, #1
ext v7.8b, v5.8b, v4.8b, #1
trn1 v4.4h, v4.4h, v6.4h
trn1 v5.4h, v5.4h, v7.4h
umull v16.8h, v4.8b, v0.8b
umlal v16.8h, v5.8b, v1.8b
.ifc \type,avg
ld1 {v18.H}[0], [x0], x2
ld1 {v18.H}[2], [x0]
ld1 {v18.h}[0], [x0], x2
ld1 {v18.h}[2], [x0]
sub x0, x0, x2
.endif
rev64 v17.4S, v16.4S
add v16.8H, v16.8H, v17.8H
rshrn v16.8B, v16.8H, #6
rev64 v17.4s, v16.4s
add v16.8h, v16.8h, v17.8h
rshrn v16.8b, v16.8h, #6
.ifc \type,avg
urhadd v16.8B, v16.8B, v18.8B
urhadd v16.8b, v16.8b, v18.8b
.endif
st1 {v16.H}[0], [x0], x2
st1 {v16.H}[2], [x0], x2
st1 {v16.h}[0], [x0], x2
st1 {v16.h}[2], [x0], x2
subs w3, w3, #2
b.gt 1b
ret
2:
ld1 {v16.H}[0], [x1], x2
ld1 {v16.H}[1], [x1], x2
ld1 {v16.h}[0], [x1], x2
ld1 {v16.h}[1], [x1], x2
.ifc \type,avg
ld1 {v18.H}[0], [x0], x2
ld1 {v18.H}[1], [x0]
ld1 {v18.h}[0], [x0], x2
ld1 {v18.h}[1], [x0]
sub x0, x0, x2
urhadd v16.8B, v16.8B, v18.8B
urhadd v16.8b, v16.8b, v18.8b
.endif
st1 {v16.H}[0], [x0], x2
st1 {v16.H}[1], [x0], x2
st1 {v16.h}[0], [x0], x2
st1 {v16.h}[1], [x0], x2
subs w3, w3, #2
b.gt 2b
ret


@@ -27,7 +27,7 @@
cmp w2, #0
ldr w6, [x4]
ccmp w3, #0, #0, ne
mov v24.S[0], w6
mov v24.s[0], w6
and w8, w6, w6, lsl #16
b.eq 1f
ands w8, w8, w8, lsl #8
@@ -38,95 +38,95 @@
.endm
.macro h264_loop_filter_luma
dup v22.16B, w2 // alpha
uxtl v24.8H, v24.8B
uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
uxtl v24.4S, v24.4H
uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
sli v24.8H, v24.8H, #8
uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
sli v24.4S, v24.4S, #16
cmhi v21.16B, v22.16B, v21.16B // < alpha
dup v22.16B, w3 // beta
cmlt v23.16B, v24.16B, #0
cmhi v28.16B, v22.16B, v28.16B // < beta
cmhi v30.16B, v22.16B, v30.16B // < beta
bic v21.16B, v21.16B, v23.16B
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
and v21.16B, v21.16B, v30.16B // < beta
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v24.4s, v24.4h
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
sli v24.8h, v24.8h, #8
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
sli v24.4s, v24.4s, #16
cmhi v21.16b, v22.16b, v21.16b // < alpha
dup v22.16b, w3 // beta
cmlt v23.16b, v24.16b, #0
cmhi v28.16b, v22.16b, v28.16b // < beta
cmhi v30.16b, v22.16b, v30.16b // < beta
bic v21.16b, v21.16b, v23.16b
uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
and v21.16b, v21.16b, v28.16b
uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
and v21.16b, v21.16b, v30.16b // < beta
shrn v30.8b, v21.8h, #4
mov x7, v30.d[0]
cmhi v17.16B, v22.16B, v17.16B // < beta
cmhi v19.16B, v22.16B, v19.16B // < beta
cmhi v17.16b, v22.16b, v17.16b // < beta
cmhi v19.16b, v22.16b, v19.16b // < beta
cbz x7, 9f
and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B
and v24.16B, v24.16B, v21.16B
urhadd v28.16B, v16.16B, v0.16B
sub v21.16B, v24.16B, v17.16B
uqadd v23.16B, v18.16B, v24.16B
uhadd v20.16B, v20.16B, v28.16B
sub v21.16B, v21.16B, v19.16B
uhadd v28.16B, v4.16B, v28.16B
umin v23.16B, v23.16B, v20.16B
uqsub v22.16B, v18.16B, v24.16B
uqadd v4.16B, v2.16B, v24.16B
umax v23.16B, v23.16B, v22.16B
uqsub v22.16B, v2.16B, v24.16B
umin v28.16B, v4.16B, v28.16B
uxtl v4.8H, v0.8B
umax v28.16B, v28.16B, v22.16B
uxtl2 v20.8H, v0.16B
usubw v4.8H, v4.8H, v16.8B
usubw2 v20.8H, v20.8H, v16.16B
shl v4.8H, v4.8H, #2
shl v20.8H, v20.8H, #2
uaddw v4.8H, v4.8H, v18.8B
uaddw2 v20.8H, v20.8H, v18.16B
usubw v4.8H, v4.8H, v2.8B
usubw2 v20.8H, v20.8H, v2.16B
rshrn v4.8B, v4.8H, #3
rshrn2 v4.16B, v20.8H, #3
bsl v17.16B, v23.16B, v18.16B
bsl v19.16B, v28.16B, v2.16B
neg v23.16B, v21.16B
uxtl v28.8H, v16.8B
smin v4.16B, v4.16B, v21.16B
uxtl2 v21.8H, v16.16B
smax v4.16B, v4.16B, v23.16B
uxtl v22.8H, v0.8B
uxtl2 v24.8H, v0.16B
saddw v28.8H, v28.8H, v4.8B
saddw2 v21.8H, v21.8H, v4.16B
ssubw v22.8H, v22.8H, v4.8B
ssubw2 v24.8H, v24.8H, v4.16B
sqxtun v16.8B, v28.8H
sqxtun2 v16.16B, v21.8H
sqxtun v0.8B, v22.8H
sqxtun2 v0.16B, v24.8H
and v17.16b, v17.16b, v21.16b
and v19.16b, v19.16b, v21.16b
and v24.16b, v24.16b, v21.16b
urhadd v28.16b, v16.16b, v0.16b
sub v21.16b, v24.16b, v17.16b
uqadd v23.16b, v18.16b, v24.16b
uhadd v20.16b, v20.16b, v28.16b
sub v21.16b, v21.16b, v19.16b
uhadd v28.16b, v4.16b, v28.16b
umin v23.16b, v23.16b, v20.16b
uqsub v22.16b, v18.16b, v24.16b
uqadd v4.16b, v2.16b, v24.16b
umax v23.16b, v23.16b, v22.16b
uqsub v22.16b, v2.16b, v24.16b
umin v28.16b, v4.16b, v28.16b
uxtl v4.8h, v0.8b
umax v28.16b, v28.16b, v22.16b
uxtl2 v20.8h, v0.16b
usubw v4.8h, v4.8h, v16.8b
usubw2 v20.8h, v20.8h, v16.16b
shl v4.8h, v4.8h, #2
shl v20.8h, v20.8h, #2
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v20.8h, v20.8h, v18.16b
usubw v4.8h, v4.8h, v2.8b
usubw2 v20.8h, v20.8h, v2.16b
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v20.8h, #3
bsl v17.16b, v23.16b, v18.16b
bsl v19.16b, v28.16b, v2.16b
neg v23.16b, v21.16b
uxtl v28.8h, v16.8b
smin v4.16b, v4.16b, v21.16b
uxtl2 v21.8h, v16.16b
smax v4.16b, v4.16b, v23.16b
uxtl v22.8h, v0.8b
uxtl2 v24.8h, v0.16b
saddw v28.8h, v28.8h, v4.8b
saddw2 v21.8h, v21.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v24.8h, v24.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun2 v16.16b, v21.8h
sqxtun v0.8b, v22.8h
sqxtun2 v0.16b, v24.8h
.endm
function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start
ld1 {v0.16B}, [x0], x1
ld1 {v2.16B}, [x0], x1
ld1 {v4.16B}, [x0], x1
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0], x1
ld1 {v4.16b}, [x0], x1
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
ld1 {v20.16B}, [x0], x1
ld1 {v18.16B}, [x0], x1
ld1 {v16.16B}, [x0], x1
ld1 {v20.16b}, [x0], x1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
h264_loop_filter_luma
sub x0, x0, x1, lsl #1
st1 {v17.16B}, [x0], x1
st1 {v16.16B}, [x0], x1
st1 {v0.16B}, [x0], x1
st1 {v19.16B}, [x0]
st1 {v17.16b}, [x0], x1
st1 {v16.16b}, [x0], x1
st1 {v0.16b}, [x0], x1
st1 {v19.16b}, [x0]
9:
ret
endfunc
@@ -135,22 +135,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
ld1 {v6.8B}, [x0], x1
ld1 {v20.8B}, [x0], x1
ld1 {v18.8B}, [x0], x1
ld1 {v16.8B}, [x0], x1
ld1 {v0.8B}, [x0], x1
ld1 {v2.8B}, [x0], x1
ld1 {v4.8B}, [x0], x1
ld1 {v26.8B}, [x0], x1
ld1 {v6.D}[1], [x0], x1
ld1 {v20.D}[1], [x0], x1
ld1 {v18.D}[1], [x0], x1
ld1 {v16.D}[1], [x0], x1
ld1 {v0.D}[1], [x0], x1
ld1 {v2.D}[1], [x0], x1
ld1 {v4.D}[1], [x0], x1
ld1 {v26.D}[1], [x0], x1
ld1 {v6.8b}, [x0], x1
ld1 {v20.8b}, [x0], x1
ld1 {v18.8b}, [x0], x1
ld1 {v16.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v4.8b}, [x0], x1
ld1 {v26.8b}, [x0], x1
ld1 {v6.d}[1], [x0], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v18.d}[1], [x0], x1
ld1 {v16.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v4.d}[1], [x0], x1
ld1 {v26.d}[1], [x0], x1
transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
@@ -160,22 +160,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
sub x0, x0, x1, lsl #4
add x0, x0, #2
st1 {v17.S}[0], [x0], x1
st1 {v16.S}[0], [x0], x1
st1 {v0.S}[0], [x0], x1
st1 {v19.S}[0], [x0], x1
st1 {v17.S}[1], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v19.S}[1], [x0], x1
st1 {v17.S}[2], [x0], x1
st1 {v16.S}[2], [x0], x1
st1 {v0.S}[2], [x0], x1
st1 {v19.S}[2], [x0], x1
st1 {v17.S}[3], [x0], x1
st1 {v16.S}[3], [x0], x1
st1 {v0.S}[3], [x0], x1
st1 {v19.S}[3], [x0], x1
st1 {v17.s}[0], [x0], x1
st1 {v16.s}[0], [x0], x1
st1 {v0.s}[0], [x0], x1
st1 {v19.s}[0], [x0], x1
st1 {v17.s}[1], [x0], x1
st1 {v16.s}[1], [x0], x1
st1 {v0.s}[1], [x0], x1
st1 {v19.s}[1], [x0], x1
st1 {v17.s}[2], [x0], x1
st1 {v16.s}[2], [x0], x1
st1 {v0.s}[2], [x0], x1
st1 {v19.s}[2], [x0], x1
st1 {v17.s}[3], [x0], x1
st1 {v16.s}[3], [x0], x1
st1 {v0.s}[3], [x0], x1
st1 {v19.s}[3], [x0], x1
9:
ret
endfunc
@@ -377,52 +377,52 @@ function ff_h264_h_loop_filter_luma_intra_neon, export=1
endfunc
.macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha
dup v23.8B, w3 // beta
uxtl v24.8H, v24.8B
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
cmhi v26.8B, v22.8B, v26.8B // < alpha
cmhi v28.8B, v23.8B, v28.8B // < beta
cmhi v30.8B, v23.8B, v30.8B // < beta
uxtl v4.8H, v0.8B
and v26.8B, v26.8B, v28.8B
usubw v4.8H, v4.8H, v16.8B
and v26.8B, v26.8B, v30.8B
shl v4.8H, v4.8H, #2
dup v22.8b, w2 // alpha
dup v23.8b, w3 // beta
uxtl v24.8h, v24.8b
uabd v26.8b, v16.8b, v0.8b // abs(p0 - q0)
uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
uabd v30.8b, v2.8b, v0.8b // abs(q1 - q0)
cmhi v26.8b, v22.8b, v26.8b // < alpha
cmhi v28.8b, v23.8b, v28.8b // < beta
cmhi v30.8b, v23.8b, v30.8b // < beta
uxtl v4.8h, v0.8b
and v26.8b, v26.8b, v28.8b
usubw v4.8h, v4.8h, v16.8b
and v26.8b, v26.8b, v30.8b
shl v4.8h, v4.8h, #2
mov x8, v26.d[0]
sli v24.8H, v24.8H, #8
uaddw v4.8H, v4.8H, v18.8B
sli v24.8h, v24.8h, #8
uaddw v4.8h, v4.8h, v18.8b
cbz x8, 9f
usubw v4.8H, v4.8H, v2.8B
rshrn v4.8B, v4.8H, #3
smin v4.8B, v4.8B, v24.8B
neg v25.8B, v24.8B
smax v4.8B, v4.8B, v25.8B
uxtl v22.8H, v0.8B
and v4.8B, v4.8B, v26.8B
uxtl v28.8H, v16.8B
saddw v28.8H, v28.8H, v4.8B
ssubw v22.8H, v22.8H, v4.8B
sqxtun v16.8B, v28.8H
sqxtun v0.8B, v22.8H
usubw v4.8h, v4.8h, v2.8b
rshrn v4.8b, v4.8h, #3
smin v4.8b, v4.8b, v24.8b
neg v25.8b, v24.8b
smax v4.8b, v4.8b, v25.8b
uxtl v22.8h, v0.8b
and v4.8b, v4.8b, v26.8b
uxtl v28.8h, v16.8b
saddw v28.8h, v28.8h, v4.8b
ssubw v22.8h, v22.8h, v4.8b
sqxtun v16.8b, v28.8h
sqxtun v0.8b, v22.8h
.endm
function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, x1, lsl #1
ld1 {v18.8B}, [x0], x1
ld1 {v16.8B}, [x0], x1
ld1 {v0.8B}, [x0], x1
ld1 {v2.8B}, [x0]
ld1 {v18.8b}, [x0], x1
ld1 {v16.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v2.8b}, [x0]
h264_loop_filter_chroma
sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1
st1 {v0.8B}, [x0], x1
st1 {v16.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
9:
ret
endfunc
@@ -432,14 +432,14 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
sub x0, x0, #2
h_loop_filter_chroma420:
ld1 {v18.S}[0], [x0], x1
ld1 {v16.S}[0], [x0], x1
ld1 {v0.S}[0], [x0], x1
ld1 {v2.S}[0], [x0], x1
ld1 {v18.S}[1], [x0], x1
ld1 {v16.S}[1], [x0], x1
ld1 {v0.S}[1], [x0], x1
ld1 {v2.S}[1], [x0], x1
ld1 {v18.s}[0], [x0], x1
ld1 {v16.s}[0], [x0], x1
ld1 {v0.s}[0], [x0], x1
ld1 {v2.s}[0], [x0], x1
ld1 {v18.s}[1], [x0], x1
ld1 {v16.s}[1], [x0], x1
ld1 {v0.s}[1], [x0], x1
ld1 {v2.s}[1], [x0], x1
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
@@ -448,14 +448,14 @@ h_loop_filter_chroma420:
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
sub x0, x0, x1, lsl #3
st1 {v18.S}[0], [x0], x1
st1 {v16.S}[0], [x0], x1
st1 {v0.S}[0], [x0], x1
st1 {v2.S}[0], [x0], x1
st1 {v18.S}[1], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v2.S}[1], [x0], x1
st1 {v18.s}[0], [x0], x1
st1 {v16.s}[0], [x0], x1
st1 {v0.s}[0], [x0], x1
st1 {v2.s}[0], [x0], x1
st1 {v18.s}[1], [x0], x1
st1 {v16.s}[1], [x0], x1
st1 {v0.s}[1], [x0], x1
st1 {v2.s}[1], [x0], x1
9:
ret
endfunc
@@ -584,102 +584,102 @@ function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
endfunc
.macro biweight_16 macs, macd
dup v0.16B, w5
dup v1.16B, w6
mov v4.16B, v16.16B
mov v6.16B, v16.16B
dup v0.16b, w5
dup v1.16b, w6
mov v4.16b, v16.16b
mov v6.16b, v16.16b
1: subs w3, w3, #2
ld1 {v20.16B}, [x0], x2
\macd v4.8H, v0.8B, v20.8B
ld1 {v20.16b}, [x0], x2
\macd v4.8h, v0.8b, v20.8b
\macd\()2 v6.8H, v0.16B, v20.16B
ld1 {v22.16B}, [x1], x2
\macs v4.8H, v1.8B, v22.8B
ld1 {v22.16b}, [x1], x2
\macs v4.8h, v1.8b, v22.8b
\macs\()2 v6.8H, v1.16B, v22.16B
mov v24.16B, v16.16B
ld1 {v28.16B}, [x0], x2
mov v26.16B, v16.16B
\macd v24.8H, v0.8B, v28.8B
mov v24.16b, v16.16b
ld1 {v28.16b}, [x0], x2
mov v26.16b, v16.16b
\macd v24.8h, v0.8b, v28.8b
\macd\()2 v26.8H, v0.16B, v28.16B
ld1 {v30.16B}, [x1], x2
\macs v24.8H, v1.8B, v30.8B
ld1 {v30.16b}, [x1], x2
\macs v24.8h, v1.8b, v30.8b
\macs\()2 v26.8H, v1.16B, v30.16B
sshl v4.8H, v4.8H, v18.8H
sshl v6.8H, v6.8H, v18.8H
sqxtun v4.8B, v4.8H
sqxtun2 v4.16B, v6.8H
sshl v24.8H, v24.8H, v18.8H
sshl v26.8H, v26.8H, v18.8H
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v26.8H
mov v6.16B, v16.16B
st1 {v4.16B}, [x7], x2
mov v4.16B, v16.16B
st1 {v24.16B}, [x7], x2
sshl v4.8h, v4.8h, v18.8h
sshl v6.8h, v6.8h, v18.8h
sqxtun v4.8b, v4.8h
sqxtun2 v4.16b, v6.8h
sshl v24.8h, v24.8h, v18.8h
sshl v26.8h, v26.8h, v18.8h
sqxtun v24.8b, v24.8h
sqxtun2 v24.16b, v26.8h
mov v6.16b, v16.16b
st1 {v4.16b}, [x7], x2
mov v4.16b, v16.16b
st1 {v24.16b}, [x7], x2
b.ne 1b
ret
.endm
.macro biweight_8 macs, macd
dup v0.8B, w5
dup v1.8B, w6
mov v2.16B, v16.16B
mov v20.16B, v16.16B
dup v0.8b, w5
dup v1.8b, w6
mov v2.16b, v16.16b
mov v20.16b, v16.16b
1: subs w3, w3, #2
ld1 {v4.8B}, [x0], x2
\macd v2.8H, v0.8B, v4.8B
ld1 {v5.8B}, [x1], x2
\macs v2.8H, v1.8B, v5.8B
ld1 {v6.8B}, [x0], x2
\macd v20.8H, v0.8B, v6.8B
ld1 {v7.8B}, [x1], x2
\macs v20.8H, v1.8B, v7.8B
sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
sshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
mov v20.16B, v16.16B
st1 {v2.8B}, [x7], x2
mov v2.16B, v16.16B
st1 {v4.8B}, [x7], x2
ld1 {v4.8b}, [x0], x2
\macd v2.8h, v0.8b, v4.8b
ld1 {v5.8b}, [x1], x2
\macs v2.8h, v1.8b, v5.8b
ld1 {v6.8b}, [x0], x2
\macd v20.8h, v0.8b, v6.8b
ld1 {v7.8b}, [x1], x2
\macs v20.8h, v1.8b, v7.8b
sshl v2.8h, v2.8h, v18.8h
sqxtun v2.8b, v2.8h
sshl v20.8h, v20.8h, v18.8h
sqxtun v4.8b, v20.8h
mov v20.16b, v16.16b
st1 {v2.8b}, [x7], x2
mov v2.16b, v16.16b
st1 {v4.8b}, [x7], x2
b.ne 1b
ret
.endm
.macro biweight_4 macs, macd
dup v0.8B, w5
dup v1.8B, w6
mov v2.16B, v16.16B
mov v20.16B,v16.16B
dup v0.8b, w5
dup v1.8b, w6
mov v2.16b, v16.16b
mov v20.16b,v16.16b
1: subs w3, w3, #4
ld1 {v4.S}[0], [x0], x2
ld1 {v4.S}[1], [x0], x2
\macd v2.8H, v0.8B, v4.8B
ld1 {v5.S}[0], [x1], x2
ld1 {v5.S}[1], [x1], x2
\macs v2.8H, v1.8B, v5.8B
ld1 {v4.s}[0], [x0], x2
ld1 {v4.s}[1], [x0], x2
\macd v2.8h, v0.8b, v4.8b
ld1 {v5.s}[0], [x1], x2
ld1 {v5.s}[1], [x1], x2
\macs v2.8h, v1.8b, v5.8b
b.lt 2f
ld1 {v6.S}[0], [x0], x2
ld1 {v6.S}[1], [x0], x2
\macd v20.8H, v0.8B, v6.8B
ld1 {v7.S}[0], [x1], x2
ld1 {v7.S}[1], [x1], x2
\macs v20.8H, v1.8B, v7.8B
sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
sshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
mov v20.16B, v16.16B
st1 {v2.S}[0], [x7], x2
st1 {v2.S}[1], [x7], x2
mov v2.16B, v16.16B
st1 {v4.S}[0], [x7], x2
st1 {v4.S}[1], [x7], x2
ld1 {v6.s}[0], [x0], x2
ld1 {v6.s}[1], [x0], x2
\macd v20.8h, v0.8b, v6.8b
ld1 {v7.s}[0], [x1], x2
ld1 {v7.s}[1], [x1], x2
\macs v20.8h, v1.8b, v7.8b
sshl v2.8h, v2.8h, v18.8h
sqxtun v2.8b, v2.8h
sshl v20.8h, v20.8h, v18.8h
sqxtun v4.8b, v20.8h
mov v20.16b, v16.16b
st1 {v2.s}[0], [x7], x2
st1 {v2.s}[1], [x7], x2
mov v2.16b, v16.16b
st1 {v4.s}[0], [x7], x2
st1 {v4.s}[1], [x7], x2
b.ne 1b
ret
2: sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
st1 {v2.S}[0], [x7], x2
st1 {v2.S}[1], [x7], x2
2: sshl v2.8h, v2.8h, v18.8h
sqxtun v2.8b, v2.8h
st1 {v2.s}[0], [x7], x2
st1 {v2.s}[1], [x7], x2
ret
.endm
@@ -689,10 +689,10 @@ function ff_biweight_h264_pixels_\w\()_neon, export=1
add w7, w7, #1
eor w8, w8, w6, lsr #30
orr w7, w7, #1
dup v18.8H, w4
dup v18.8h, w4
lsl w7, w7, w4
not v18.16B, v18.16B
dup v16.8H, w7
not v18.16b, v18.16b
dup v16.8h, w7
mov x7, x0
cbz w8, 10f
subs w8, w8, #1
@@ -716,78 +716,78 @@ endfunc
biweight_func 4
.macro weight_16 add
dup v0.16B, w4
dup v0.16b, w4
1: subs w2, w2, #2
ld1 {v20.16B}, [x0], x1
umull v4.8H, v0.8B, v20.8B
umull2 v6.8H, v0.16B, v20.16B
ld1 {v28.16B}, [x0], x1
umull v24.8H, v0.8B, v28.8B
umull2 v26.8H, v0.16B, v28.16B
\add v4.8H, v16.8H, v4.8H
srshl v4.8H, v4.8H, v18.8H
\add v6.8H, v16.8H, v6.8H
srshl v6.8H, v6.8H, v18.8H
sqxtun v4.8B, v4.8H
sqxtun2 v4.16B, v6.8H
\add v24.8H, v16.8H, v24.8H
srshl v24.8H, v24.8H, v18.8H
\add v26.8H, v16.8H, v26.8H
srshl v26.8H, v26.8H, v18.8H
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v26.8H
st1 {v4.16B}, [x5], x1
st1 {v24.16B}, [x5], x1
ld1 {v20.16b}, [x0], x1
umull v4.8h, v0.8b, v20.8b
umull2 v6.8h, v0.16b, v20.16b
ld1 {v28.16b}, [x0], x1
umull v24.8h, v0.8b, v28.8b
umull2 v26.8h, v0.16b, v28.16b
\add v4.8h, v16.8h, v4.8h
srshl v4.8h, v4.8h, v18.8h
\add v6.8h, v16.8h, v6.8h
srshl v6.8h, v6.8h, v18.8h
sqxtun v4.8b, v4.8h
sqxtun2 v4.16b, v6.8h
\add v24.8h, v16.8h, v24.8h
srshl v24.8h, v24.8h, v18.8h
\add v26.8h, v16.8h, v26.8h
srshl v26.8h, v26.8h, v18.8h
sqxtun v24.8b, v24.8h
sqxtun2 v24.16b, v26.8h
st1 {v4.16b}, [x5], x1
st1 {v24.16b}, [x5], x1
b.ne 1b
ret
.endm
.macro weight_8 add
dup v0.8B, w4
dup v0.8b, w4
1: subs w2, w2, #2
ld1 {v4.8B}, [x0], x1
umull v2.8H, v0.8B, v4.8B
ld1 {v6.8B}, [x0], x1
umull v20.8H, v0.8B, v6.8B
\add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
\add v20.8H, v16.8H, v20.8H
srshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
st1 {v2.8B}, [x5], x1
st1 {v4.8B}, [x5], x1
ld1 {v4.8b}, [x0], x1
umull v2.8h, v0.8b, v4.8b
ld1 {v6.8b}, [x0], x1
umull v20.8h, v0.8b, v6.8b
\add v2.8h, v16.8h, v2.8h
srshl v2.8h, v2.8h, v18.8h
sqxtun v2.8b, v2.8h
\add v20.8h, v16.8h, v20.8h
srshl v20.8h, v20.8h, v18.8h
sqxtun v4.8b, v20.8h
st1 {v2.8b}, [x5], x1
st1 {v4.8b}, [x5], x1
b.ne 1b
ret
.endm
.macro weight_4 add
dup v0.8B, w4
dup v0.8b, w4
1: subs w2, w2, #4
ld1 {v4.S}[0], [x0], x1
ld1 {v4.S}[1], [x0], x1
umull v2.8H, v0.8B, v4.8B
ld1 {v4.s}[0], [x0], x1
ld1 {v4.s}[1], [x0], x1
umull v2.8h, v0.8b, v4.8b
b.lt 2f
ld1 {v6.S}[0], [x0], x1
ld1 {v6.S}[1], [x0], x1
umull v20.8H, v0.8B, v6.8B
\add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
\add v20.8H, v16.8H, v20.8H
srshl v20.8H, v20.8h, v18.8H
sqxtun v4.8B, v20.8H
st1 {v2.S}[0], [x5], x1
st1 {v2.S}[1], [x5], x1
st1 {v4.S}[0], [x5], x1
st1 {v4.S}[1], [x5], x1
ld1 {v6.s}[0], [x0], x1
ld1 {v6.s}[1], [x0], x1
umull v20.8h, v0.8b, v6.8b
\add v2.8h, v16.8h, v2.8h
srshl v2.8h, v2.8h, v18.8h
sqxtun v2.8b, v2.8h
\add v20.8h, v16.8h, v20.8h
srshl v20.8h, v20.8h, v18.8h
sqxtun v4.8b, v20.8h
st1 {v2.s}[0], [x5], x1
st1 {v2.s}[1], [x5], x1
st1 {v4.s}[0], [x5], x1
st1 {v4.s}[1], [x5], x1
b.ne 1b
ret
2: \add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
st1 {v2.S}[0], [x5], x1
st1 {v2.S}[1], [x5], x1
2: \add v2.8h, v16.8h, v2.8h
srshl v2.8h, v2.8h, v18.8h
sqxtun v2.8b, v2.8h
st1 {v2.s}[0], [x5], x1
st1 {v2.s}[1], [x5], x1
ret
.endm
@@ -796,18 +796,18 @@ function ff_weight_h264_pixels_\w\()_neon, export=1
cmp w3, #1
mov w6, #1
lsl w5, w5, w3
dup v16.8H, w5
dup v16.8h, w5
mov x5, x0
b.le 20f
sub w6, w6, w3
dup v18.8H, w6
dup v18.8h, w6
cmp w4, #0
b.lt 10f
weight_\w shadd
10: neg w4, w4
weight_\w shsub
20: neg w6, w3
dup v18.8H, w6
dup v18.8h, w6
cmp w4, #0
b.lt 10f
weight_\w add
@@ -825,7 +825,7 @@ endfunc
ldr w6, [x4]
ccmp w3, #0, #0, ne
lsl w2, w2, #2
mov v24.S[0], w6
mov v24.s[0], w6
lsl w3, w3, #2
and w8, w6, w6, lsl #16
b.eq 1f


@ -25,54 +25,54 @@
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]
sxtw x2, w2
movi v30.8H, #0
movi v30.8h, #0
add v4.4H, v0.4H, v2.4H
sshr v16.4H, v1.4H, #1
st1 {v30.8H}, [x1], #16
sshr v17.4H, v3.4H, #1
st1 {v30.8H}, [x1], #16
sub v5.4H, v0.4H, v2.4H
sub v6.4H, v16.4H, v3.4H
add v7.4H, v1.4H, v17.4H
add v0.4H, v4.4H, v7.4H
add v1.4H, v5.4H, v6.4H
sub v2.4H, v5.4H, v6.4H
sub v3.4H, v4.4H, v7.4H
add v4.4h, v0.4h, v2.4h
sshr v16.4h, v1.4h, #1
st1 {v30.8h}, [x1], #16
sshr v17.4h, v3.4h, #1
st1 {v30.8h}, [x1], #16
sub v5.4h, v0.4h, v2.4h
sub v6.4h, v16.4h, v3.4h
add v7.4h, v1.4h, v17.4h
add v0.4h, v4.4h, v7.4h
add v1.4h, v5.4h, v6.4h
sub v2.4h, v5.4h, v6.4h
sub v3.4h, v4.4h, v7.4h
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
add v4.4H, v0.4H, v2.4H
ld1 {v18.S}[0], [x0], x2
sshr v16.4H, v3.4H, #1
sshr v17.4H, v1.4H, #1
ld1 {v18.S}[1], [x0], x2
sub v5.4H, v0.4H, v2.4H
ld1 {v19.S}[1], [x0], x2
add v6.4H, v16.4H, v1.4H
ins v4.D[1], v5.D[0]
sub v7.4H, v17.4H, v3.4H
ld1 {v19.S}[0], [x0], x2
ins v6.D[1], v7.D[0]
add v4.4h, v0.4h, v2.4h
ld1 {v18.s}[0], [x0], x2
sshr v16.4h, v3.4h, #1
sshr v17.4h, v1.4h, #1
ld1 {v18.s}[1], [x0], x2
sub v5.4h, v0.4h, v2.4h
ld1 {v19.s}[1], [x0], x2
add v6.4h, v16.4h, v1.4h
ins v4.d[1], v5.d[0]
sub v7.4h, v17.4h, v3.4h
ld1 {v19.s}[0], [x0], x2
ins v6.d[1], v7.d[0]
sub x0, x0, x2, lsl #2
add v0.8H, v4.8H, v6.8H
sub v1.8H, v4.8H, v6.8H
add v0.8h, v4.8h, v6.8h
sub v1.8h, v4.8h, v6.8h
srshr v0.8H, v0.8H, #6
srshr v1.8H, v1.8H, #6
srshr v0.8h, v0.8h, #6
srshr v1.8h, v1.8h, #6
uaddw v0.8H, v0.8H, v18.8B
uaddw v1.8H, v1.8H, v19.8B
uaddw v0.8h, v0.8h, v18.8b
uaddw v1.8h, v1.8h, v19.8b
sqxtun v0.8B, v0.8H
sqxtun v1.8B, v1.8H
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
st1 {v0.S}[0], [x0], x2
st1 {v0.S}[1], [x0], x2
st1 {v1.S}[1], [x0], x2
st1 {v1.S}[0], [x0], x2
st1 {v0.s}[0], [x0], x2
st1 {v0.s}[1], [x0], x2
st1 {v1.s}[1], [x0], x2
st1 {v1.s}[0], [x0], x2
sub x1, x1, #32
ret
@@ -83,22 +83,22 @@ function ff_h264_idct_dc_add_neon, export=1
AARCH64_VALID_CALL_TARGET
sxtw x2, w2
mov w3, #0
ld1r {v2.8H}, [x1]
ld1r {v2.8h}, [x1]
strh w3, [x1]
srshr v2.8H, v2.8H, #6
ld1 {v0.S}[0], [x0], x2
ld1 {v0.S}[1], [x0], x2
uaddw v3.8H, v2.8H, v0.8B
ld1 {v1.S}[0], [x0], x2
ld1 {v1.S}[1], [x0], x2
uaddw v4.8H, v2.8H, v1.8B
sqxtun v0.8B, v3.8H
sqxtun v1.8B, v4.8H
srshr v2.8h, v2.8h, #6
ld1 {v0.s}[0], [x0], x2
ld1 {v0.s}[1], [x0], x2
uaddw v3.8h, v2.8h, v0.8b
ld1 {v1.s}[0], [x0], x2
ld1 {v1.s}[1], [x0], x2
uaddw v4.8h, v2.8h, v1.8b
sqxtun v0.8b, v3.8h
sqxtun v1.8b, v4.8h
sub x0, x0, x2, lsl #2
st1 {v0.S}[0], [x0], x2
st1 {v0.S}[1], [x0], x2
st1 {v1.S}[0], [x0], x2
st1 {v1.S}[1], [x0], x2
st1 {v0.s}[0], [x0], x2
st1 {v0.s}[1], [x0], x2
st1 {v1.s}[0], [x0], x2
st1 {v1.s}[1], [x0], x2
ret
endfunc
@@ -194,71 +194,71 @@ endfunc
.if \pass == 0
va .req v18
vb .req v30
sshr v18.8H, v26.8H, #1
add v16.8H, v24.8H, v28.8H
ld1 {v30.8H, v31.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
sub v17.8H, v24.8H, v28.8H
sshr v19.8H, v30.8H, #1
sub v18.8H, v18.8H, v30.8H
add v19.8H, v19.8H, v26.8H
sshr v18.8h, v26.8h, #1
add v16.8h, v24.8h, v28.8h
ld1 {v30.8h, v31.8h}, [x1]
st1 {v19.8h}, [x1], #16
st1 {v19.8h}, [x1], #16
sub v17.8h, v24.8h, v28.8h
sshr v19.8h, v30.8h, #1
sub v18.8h, v18.8h, v30.8h
add v19.8h, v19.8h, v26.8h
.else
va .req v30
vb .req v18
sshr v30.8H, v26.8H, #1
sshr v19.8H, v18.8H, #1
add v16.8H, v24.8H, v28.8H
sub v17.8H, v24.8H, v28.8H
sub v30.8H, v30.8H, v18.8H
add v19.8H, v19.8H, v26.8H
sshr v30.8h, v26.8h, #1
sshr v19.8h, v18.8h, #1
add v16.8h, v24.8h, v28.8h
sub v17.8h, v24.8h, v28.8h
sub v30.8h, v30.8h, v18.8h
add v19.8h, v19.8h, v26.8h
.endif
add v26.8H, v17.8H, va.8H
sub v28.8H, v17.8H, va.8H
add v24.8H, v16.8H, v19.8H
sub vb.8H, v16.8H, v19.8H
sub v16.8H, v29.8H, v27.8H
add v17.8H, v31.8H, v25.8H
sub va.8H, v31.8H, v25.8H
add v19.8H, v29.8H, v27.8H
sub v16.8H, v16.8H, v31.8H
sub v17.8H, v17.8H, v27.8H
add va.8H, va.8H, v29.8H
add v19.8H, v19.8H, v25.8H
sshr v25.8H, v25.8H, #1
sshr v27.8H, v27.8H, #1
sshr v29.8H, v29.8H, #1
sshr v31.8H, v31.8H, #1
sub v16.8H, v16.8H, v31.8H
sub v17.8H, v17.8H, v27.8H
add va.8H, va.8H, v29.8H
add v19.8H, v19.8H, v25.8H
sshr v25.8H, v16.8H, #2
sshr v27.8H, v17.8H, #2
sshr v29.8H, va.8H, #2
sshr v31.8H, v19.8H, #2
sub v19.8H, v19.8H, v25.8H
sub va.8H, v27.8H, va.8H
add v17.8H, v17.8H, v29.8H
add v16.8H, v16.8H, v31.8H
add v26.8h, v17.8h, va.8h
sub v28.8h, v17.8h, va.8h
add v24.8h, v16.8h, v19.8h
sub vb.8h, v16.8h, v19.8h
sub v16.8h, v29.8h, v27.8h
add v17.8h, v31.8h, v25.8h
sub va.8h, v31.8h, v25.8h
add v19.8h, v29.8h, v27.8h
sub v16.8h, v16.8h, v31.8h
sub v17.8h, v17.8h, v27.8h
add va.8h, va.8h, v29.8h
add v19.8h, v19.8h, v25.8h
sshr v25.8h, v25.8h, #1
sshr v27.8h, v27.8h, #1
sshr v29.8h, v29.8h, #1
sshr v31.8h, v31.8h, #1
sub v16.8h, v16.8h, v31.8h
sub v17.8h, v17.8h, v27.8h
add va.8h, va.8h, v29.8h
add v19.8h, v19.8h, v25.8h
sshr v25.8h, v16.8h, #2
sshr v27.8h, v17.8h, #2
sshr v29.8h, va.8h, #2
sshr v31.8h, v19.8h, #2
sub v19.8h, v19.8h, v25.8h
sub va.8h, v27.8h, va.8h
add v17.8h, v17.8h, v29.8h
add v16.8h, v16.8h, v31.8h
.if \pass == 0
sub v31.8H, v24.8H, v19.8H
add v24.8H, v24.8H, v19.8H
add v25.8H, v26.8H, v18.8H
sub v18.8H, v26.8H, v18.8H
add v26.8H, v28.8H, v17.8H
add v27.8H, v30.8H, v16.8H
sub v29.8H, v28.8H, v17.8H
sub v28.8H, v30.8H, v16.8H
sub v31.8h, v24.8h, v19.8h
add v24.8h, v24.8h, v19.8h
add v25.8h, v26.8h, v18.8h
sub v18.8h, v26.8h, v18.8h
add v26.8h, v28.8h, v17.8h
add v27.8h, v30.8h, v16.8h
sub v29.8h, v28.8h, v17.8h
sub v28.8h, v30.8h, v16.8h
.else
sub v31.8H, v24.8H, v19.8H
add v24.8H, v24.8H, v19.8H
add v25.8H, v26.8H, v30.8H
sub v30.8H, v26.8H, v30.8H
add v26.8H, v28.8H, v17.8H
sub v29.8H, v28.8H, v17.8H
add v27.8H, v18.8H, v16.8H
sub v28.8H, v18.8H, v16.8H
sub v31.8h, v24.8h, v19.8h
add v24.8h, v24.8h, v19.8h
add v25.8h, v26.8h, v30.8h
sub v30.8h, v26.8h, v30.8h
add v26.8h, v28.8h, v17.8h
sub v29.8h, v28.8h, v17.8h
add v27.8h, v18.8h, v16.8h
sub v28.8h, v18.8h, v16.8h
.endif
.unreq va
.unreq vb
@@ -267,63 +267,63 @@ endfunc
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
AARCH64_VALID_CALL_TARGET
movi v19.8H, #0
movi v19.8h, #0
sxtw x2, w2
ld1 {v24.8H, v25.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
ld1 {v26.8H, v27.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
ld1 {v28.8H, v29.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
ld1 {v24.8h, v25.8h}, [x1]
st1 {v19.8h}, [x1], #16
st1 {v19.8h}, [x1], #16
ld1 {v26.8h, v27.8h}, [x1]
st1 {v19.8h}, [x1], #16
st1 {v19.8h}, [x1], #16
ld1 {v28.8h, v29.8h}, [x1]
st1 {v19.8h}, [x1], #16
st1 {v19.8h}, [x1], #16
idct8x8_cols 0
transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
idct8x8_cols 1
mov x3, x0
srshr v24.8H, v24.8H, #6
ld1 {v0.8B}, [x0], x2
srshr v25.8H, v25.8H, #6
ld1 {v1.8B}, [x0], x2
srshr v26.8H, v26.8H, #6
ld1 {v2.8B}, [x0], x2
srshr v27.8H, v27.8H, #6
ld1 {v3.8B}, [x0], x2
srshr v28.8H, v28.8H, #6
ld1 {v4.8B}, [x0], x2
srshr v29.8H, v29.8H, #6
ld1 {v5.8B}, [x0], x2
srshr v30.8H, v30.8H, #6
ld1 {v6.8B}, [x0], x2
srshr v31.8H, v31.8H, #6
ld1 {v7.8B}, [x0], x2
uaddw v24.8H, v24.8H, v0.8B
uaddw v25.8H, v25.8H, v1.8B
uaddw v26.8H, v26.8H, v2.8B
sqxtun v0.8B, v24.8H
uaddw v27.8H, v27.8H, v3.8B
sqxtun v1.8B, v25.8H
uaddw v28.8H, v28.8H, v4.8B
sqxtun v2.8B, v26.8H
st1 {v0.8B}, [x3], x2
uaddw v29.8H, v29.8H, v5.8B
sqxtun v3.8B, v27.8H
st1 {v1.8B}, [x3], x2
uaddw v30.8H, v30.8H, v6.8B
sqxtun v4.8B, v28.8H
st1 {v2.8B}, [x3], x2
uaddw v31.8H, v31.8H, v7.8B
sqxtun v5.8B, v29.8H
st1 {v3.8B}, [x3], x2
sqxtun v6.8B, v30.8H
sqxtun v7.8B, v31.8H
st1 {v4.8B}, [x3], x2
st1 {v5.8B}, [x3], x2
st1 {v6.8B}, [x3], x2
st1 {v7.8B}, [x3], x2
srshr v24.8h, v24.8h, #6
ld1 {v0.8b}, [x0], x2
srshr v25.8h, v25.8h, #6
ld1 {v1.8b}, [x0], x2
srshr v26.8h, v26.8h, #6
ld1 {v2.8b}, [x0], x2
srshr v27.8h, v27.8h, #6
ld1 {v3.8b}, [x0], x2
srshr v28.8h, v28.8h, #6
ld1 {v4.8b}, [x0], x2
srshr v29.8h, v29.8h, #6
ld1 {v5.8b}, [x0], x2
srshr v30.8h, v30.8h, #6
ld1 {v6.8b}, [x0], x2
srshr v31.8h, v31.8h, #6
ld1 {v7.8b}, [x0], x2
uaddw v24.8h, v24.8h, v0.8b
uaddw v25.8h, v25.8h, v1.8b
uaddw v26.8h, v26.8h, v2.8b
sqxtun v0.8b, v24.8h
uaddw v27.8h, v27.8h, v3.8b
sqxtun v1.8b, v25.8h
uaddw v28.8h, v28.8h, v4.8b
sqxtun v2.8b, v26.8h
st1 {v0.8b}, [x3], x2
uaddw v29.8h, v29.8h, v5.8b
sqxtun v3.8b, v27.8h
st1 {v1.8b}, [x3], x2
uaddw v30.8h, v30.8h, v6.8b
sqxtun v4.8b, v28.8h
st1 {v2.8b}, [x3], x2
uaddw v31.8h, v31.8h, v7.8b
sqxtun v5.8b, v29.8h
st1 {v3.8b}, [x3], x2
sqxtun v6.8b, v30.8h
sqxtun v7.8b, v31.8h
st1 {v4.8b}, [x3], x2
st1 {v5.8b}, [x3], x2
st1 {v6.8b}, [x3], x2
st1 {v7.8b}, [x3], x2
sub x1, x1, #128
ret
@@ -334,42 +334,42 @@ function ff_h264_idct8_dc_add_neon, export=1
AARCH64_VALID_CALL_TARGET
mov w3, #0
sxtw x2, w2
ld1r {v31.8H}, [x1]
ld1r {v31.8h}, [x1]
strh w3, [x1]
ld1 {v0.8B}, [x0], x2
srshr v31.8H, v31.8H, #6
ld1 {v1.8B}, [x0], x2
ld1 {v2.8B}, [x0], x2
uaddw v24.8H, v31.8H, v0.8B
ld1 {v3.8B}, [x0], x2
uaddw v25.8H, v31.8H, v1.8B
ld1 {v4.8B}, [x0], x2
uaddw v26.8H, v31.8H, v2.8B
ld1 {v5.8B}, [x0], x2
uaddw v27.8H, v31.8H, v3.8B
ld1 {v6.8B}, [x0], x2
uaddw v28.8H, v31.8H, v4.8B
ld1 {v7.8B}, [x0], x2
uaddw v29.8H, v31.8H, v5.8B
uaddw v30.8H, v31.8H, v6.8B
uaddw v31.8H, v31.8H, v7.8B
sqxtun v0.8B, v24.8H
sqxtun v1.8B, v25.8H
sqxtun v2.8B, v26.8H
sqxtun v3.8B, v27.8H
ld1 {v0.8b}, [x0], x2
srshr v31.8h, v31.8h, #6
ld1 {v1.8b}, [x0], x2
ld1 {v2.8b}, [x0], x2
uaddw v24.8h, v31.8h, v0.8b
ld1 {v3.8b}, [x0], x2
uaddw v25.8h, v31.8h, v1.8b
ld1 {v4.8b}, [x0], x2
uaddw v26.8h, v31.8h, v2.8b
ld1 {v5.8b}, [x0], x2
uaddw v27.8h, v31.8h, v3.8b
ld1 {v6.8b}, [x0], x2
uaddw v28.8h, v31.8h, v4.8b
ld1 {v7.8b}, [x0], x2
uaddw v29.8h, v31.8h, v5.8b
uaddw v30.8h, v31.8h, v6.8b
uaddw v31.8h, v31.8h, v7.8b
sqxtun v0.8b, v24.8h
sqxtun v1.8b, v25.8h
sqxtun v2.8b, v26.8h
sqxtun v3.8b, v27.8h
sub x0, x0, x2, lsl #3
st1 {v0.8B}, [x0], x2
sqxtun v4.8B, v28.8H
st1 {v1.8B}, [x0], x2
sqxtun v5.8B, v29.8H
st1 {v2.8B}, [x0], x2
sqxtun v6.8B, v30.8H
st1 {v3.8B}, [x0], x2
sqxtun v7.8B, v31.8H
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
st1 {v6.8B}, [x0], x2
st1 {v7.8B}, [x0], x2
st1 {v0.8b}, [x0], x2
sqxtun v4.8b, v28.8h
st1 {v1.8b}, [x0], x2
sqxtun v5.8b, v29.8h
st1 {v2.8b}, [x0], x2
sqxtun v6.8b, v30.8h
st1 {v3.8b}, [x0], x2
sqxtun v7.8b, v31.8h
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2
st1 {v6.8b}, [x0], x2
st1 {v7.8b}, [x0], x2
ret
endfunc


@@ -27,127 +27,127 @@
.macro lowpass_const r
movz \r, #20, lsl #16
movk \r, #5
mov v6.S[0], \r
mov v6.s[0], \r
.endm
//trashes v0-v5
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3
uaddl v2.8H, v2.8B, v3.8B
ext v4.8B, \r0\().8B, \r1\().8B, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4
uaddl v4.8H, v4.8B, v5.8B
ext v1.8B, \r0\().8B, \r1\().8B, #5
uaddl \d0\().8H, \r0\().8B, v1.8B
ext v0.8B, \r2\().8B, \r3\().8B, #2
mla \d0\().8H, v2.8H, v6.H[1]
ext v1.8B, \r2\().8B, \r3\().8B, #3
uaddl v0.8H, v0.8B, v1.8B
ext v1.8B, \r2\().8B, \r3\().8B, #1
mls \d0\().8H, v4.8H, v6.H[0]
ext v3.8B, \r2\().8B, \r3\().8B, #4
uaddl v1.8H, v1.8B, v3.8B
ext v2.8B, \r2\().8B, \r3\().8B, #5
uaddl \d1\().8H, \r2\().8B, v2.8B
mla \d1\().8H, v0.8H, v6.H[1]
mls \d1\().8H, v1.8H, v6.H[0]
ext v2.8b, \r0\().8b, \r1\().8b, #2
ext v3.8b, \r0\().8b, \r1\().8b, #3
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, \r0\().8b, \r1\().8b, #1
ext v5.8b, \r0\().8b, \r1\().8b, #4
uaddl v4.8h, v4.8b, v5.8b
ext v1.8b, \r0\().8b, \r1\().8b, #5
uaddl \d0\().8h, \r0\().8b, v1.8b
ext v0.8b, \r2\().8b, \r3\().8b, #2
mla \d0\().8h, v2.8h, v6.h[1]
ext v1.8b, \r2\().8b, \r3\().8b, #3
uaddl v0.8h, v0.8b, v1.8b
ext v1.8b, \r2\().8b, \r3\().8b, #1
mls \d0\().8h, v4.8h, v6.h[0]
ext v3.8b, \r2\().8b, \r3\().8b, #4
uaddl v1.8h, v1.8b, v3.8b
ext v2.8b, \r2\().8b, \r3\().8b, #5
uaddl \d1\().8h, \r2\().8b, v2.8b
mla \d1\().8h, v0.8h, v6.h[1]
mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d1\().8B, \d1\().8H, #5
sqrshrun \d0\().8b, \d0\().8h, #5
sqrshrun \d1\().8b, \d1\().8h, #5
.endif
.endm
//trashes v0-v4
.macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
uaddl v2.8H, \r2\().8B, \r3\().8B
uaddl v0.8H, \r3\().8B, \r4\().8B
uaddl v4.8H, \r1\().8B, \r4\().8B
uaddl v1.8H, \r2\().8B, \r5\().8B
uaddl \d0\().8H, \r0\().8B, \r5\().8B
uaddl \d1\().8H, \r1\().8B, \r6\().8B
mla \d0\().8H, v2.8H, v6.H[1]
mls \d0\().8H, v4.8H, v6.H[0]
mla \d1\().8H, v0.8H, v6.H[1]
mls \d1\().8H, v1.8H, v6.H[0]
uaddl v2.8h, \r2\().8b, \r3\().8b
uaddl v0.8h, \r3\().8b, \r4\().8b
uaddl v4.8h, \r1\().8b, \r4\().8b
uaddl v1.8h, \r2\().8b, \r5\().8b
uaddl \d0\().8h, \r0\().8b, \r5\().8b
uaddl \d1\().8h, \r1\().8b, \r6\().8b
mla \d0\().8h, v2.8h, v6.h[1]
mls \d0\().8h, v4.8h, v6.h[0]
mla \d1\().8h, v0.8h, v6.h[1]
mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d1\().8B, \d1\().8H, #5
sqrshrun \d0\().8b, \d0\().8h, #5
sqrshrun \d1\().8b, \d1\().8h, #5
.endif
.endm
//trashes v0-v5, v7, v30-v31
.macro lowpass_8H r0, r1
ext v0.16B, \r0\().16B, \r0\().16B, #2
ext v1.16B, \r0\().16B, \r0\().16B, #3
uaddl v0.8H, v0.8B, v1.8B
ext v2.16B, \r0\().16B, \r0\().16B, #1
ext v3.16B, \r0\().16B, \r0\().16B, #4
uaddl v2.8H, v2.8B, v3.8B
ext v30.16B, \r0\().16B, \r0\().16B, #5
uaddl \r0\().8H, \r0\().8B, v30.8B
ext v4.16B, \r1\().16B, \r1\().16B, #2
mla \r0\().8H, v0.8H, v6.H[1]
ext v5.16B, \r1\().16B, \r1\().16B, #3
uaddl v4.8H, v4.8B, v5.8B
ext v7.16B, \r1\().16B, \r1\().16B, #1
mls \r0\().8H, v2.8H, v6.H[0]
ext v0.16B, \r1\().16B, \r1\().16B, #4
uaddl v7.8H, v7.8B, v0.8B
ext v31.16B, \r1\().16B, \r1\().16B, #5
uaddl \r1\().8H, \r1\().8B, v31.8B
mla \r1\().8H, v4.8H, v6.H[1]
mls \r1\().8H, v7.8H, v6.H[0]
ext v0.16b, \r0\().16b, \r0\().16b, #2
ext v1.16b, \r0\().16b, \r0\().16b, #3
uaddl v0.8h, v0.8b, v1.8b
ext v2.16b, \r0\().16b, \r0\().16b, #1
ext v3.16b, \r0\().16b, \r0\().16b, #4
uaddl v2.8h, v2.8b, v3.8b
ext v30.16b, \r0\().16b, \r0\().16b, #5
uaddl \r0\().8h, \r0\().8b, v30.8b
ext v4.16b, \r1\().16b, \r1\().16b, #2
mla \r0\().8h, v0.8h, v6.h[1]
ext v5.16b, \r1\().16b, \r1\().16b, #3
uaddl v4.8h, v4.8b, v5.8b
ext v7.16b, \r1\().16b, \r1\().16b, #1
mls \r0\().8h, v2.8h, v6.h[0]
ext v0.16b, \r1\().16b, \r1\().16b, #4
uaddl v7.8h, v7.8b, v0.8b
ext v31.16b, \r1\().16b, \r1\().16b, #5
uaddl \r1\().8h, \r1\().8b, v31.8b
mla \r1\().8h, v4.8h, v6.h[1]
mls \r1\().8h, v7.8h, v6.h[0]
.endm
// trashes v2-v5, v30
.macro lowpass_8_1 r0, r1, d0, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3
uaddl v2.8H, v2.8B, v3.8B
ext v4.8B, \r0\().8B, \r1\().8B, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4
uaddl v4.8H, v4.8B, v5.8B
ext v30.8B, \r0\().8B, \r1\().8B, #5
uaddl \d0\().8H, \r0\().8B, v30.8B
mla \d0\().8H, v2.8H, v6.H[1]
mls \d0\().8H, v4.8H, v6.H[0]
ext v2.8b, \r0\().8b, \r1\().8b, #2
ext v3.8b, \r0\().8b, \r1\().8b, #3
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, \r0\().8b, \r1\().8b, #1
ext v5.8b, \r0\().8b, \r1\().8b, #4
uaddl v4.8h, v4.8b, v5.8b
ext v30.8b, \r0\().8b, \r1\().8b, #5
uaddl \d0\().8h, \r0\().8b, v30.8b
mla \d0\().8h, v2.8h, v6.h[1]
mls \d0\().8h, v4.8h, v6.h[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d0\().8b, \d0\().8h, #5
.endif
.endm
// trashed v0-v7
.macro lowpass_8.16 r0, r1, r2, r3, r4, r5
saddl v5.4S, \r2\().4H, \r3\().4H
saddl2 v1.4S, \r2\().8H, \r3\().8H
saddl v6.4S, \r1\().4H, \r4\().4H
saddl2 v2.4S, \r1\().8H, \r4\().8H
saddl v0.4S, \r0\().4H, \r5\().4H
saddl2 v4.4S, \r0\().8H, \r5\().8H
saddl v5.4s, \r2\().4h, \r3\().4h
saddl2 v1.4s, \r2\().8h, \r3\().8h
saddl v6.4s, \r1\().4h, \r4\().4h
saddl2 v2.4s, \r1\().8h, \r4\().8h
saddl v0.4s, \r0\().4h, \r5\().4h
saddl2 v4.4s, \r0\().8h, \r5\().8h
shl v3.4S, v5.4S, #4
shl v5.4S, v5.4S, #2
shl v7.4S, v6.4S, #2
add v5.4S, v5.4S, v3.4S
add v6.4S, v6.4S, v7.4S
shl v3.4s, v5.4s, #4
shl v5.4s, v5.4s, #2
shl v7.4s, v6.4s, #2
add v5.4s, v5.4s, v3.4s
add v6.4s, v6.4s, v7.4s
shl v3.4S, v1.4S, #4
shl v1.4S, v1.4S, #2
shl v7.4S, v2.4S, #2
add v1.4S, v1.4S, v3.4S
add v2.4S, v2.4S, v7.4S
shl v3.4s, v1.4s, #4
shl v1.4s, v1.4s, #2
shl v7.4s, v2.4s, #2
add v1.4s, v1.4s, v3.4s
add v2.4s, v2.4s, v7.4s
add v5.4S, v5.4S, v0.4S
sub v5.4S, v5.4S, v6.4S
add v5.4s, v5.4s, v0.4s
sub v5.4s, v5.4s, v6.4s
add v1.4S, v1.4S, v4.4S
sub v1.4S, v1.4S, v2.4S
add v1.4s, v1.4s, v4.4s
sub v1.4s, v1.4s, v2.4s
rshrn v5.4H, v5.4S, #10
rshrn2 v5.8H, v1.4S, #10
rshrn v5.4h, v5.4s, #10
rshrn2 v5.8h, v1.4s, #10
sqxtun \r0\().8B, v5.8H
sqxtun \r0\().8b, v5.8h
.endm
function put_h264_qpel16_h_lowpass_neon_packed
@@ -176,19 +176,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
endfunc
function \type\()_h264_qpel8_h_lowpass_neon
1: ld1 {v28.8B, v29.8B}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2
1: ld1 {v28.8b, v29.8b}, [x1], x2
ld1 {v16.8b, v17.8b}, [x1], x2
subs x12, x12, #2
lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg
ld1 {v2.8B}, [x0], x3
ld1 {v3.8B}, [x0]
urhadd v28.8B, v28.8B, v2.8B
urhadd v16.8B, v16.8B, v3.8B
ld1 {v2.8b}, [x0], x3
ld1 {v3.8b}, [x0]
urhadd v28.8b, v28.8b, v2.8b
urhadd v16.8b, v16.8b, v3.8b
sub x0, x0, x3
.endif
st1 {v28.8B}, [x0], x3
st1 {v16.8B}, [x0], x3
st1 {v28.8b}, [x0], x3
st1 {v16.8b}, [x0], x3
b.ne 1b
ret
endfunc
@@ -213,23 +213,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function \type\()_h264_qpel8_h_lowpass_l2_neon
1: ld1 {v26.8B, v27.8B}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2
ld1 {v28.8B}, [x3], x2
ld1 {v29.8B}, [x3], x2
1: ld1 {v26.8b, v27.8b}, [x1], x2
ld1 {v16.8b, v17.8b}, [x1], x2
ld1 {v28.8b}, [x3], x2
ld1 {v29.8b}, [x3], x2
subs x12, x12, #2
lowpass_8 v26, v27, v16, v17, v26, v27
urhadd v26.8B, v26.8B, v28.8B
urhadd v27.8B, v27.8B, v29.8B
urhadd v26.8b, v26.8b, v28.8b
urhadd v27.8b, v27.8b, v29.8b
.ifc \type,avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v26.8B, v26.8B, v2.8B
urhadd v27.8B, v27.8B, v3.8B
ld1 {v2.8b}, [x0], x2
ld1 {v3.8b}, [x0]
urhadd v26.8b, v26.8b, v2.8b
urhadd v27.8b, v27.8b, v3.8b
sub x0, x0, x2
.endif
st1 {v26.8B}, [x0], x2
st1 {v27.8B}, [x0], x2
st1 {v26.8b}, [x0], x2
st1 {v27.8b}, [x0], x2
b.ne 1b
ret
endfunc
@@ -270,52 +270,52 @@ function \type\()_h264_qpel16_v_lowpass_neon
endfunc
function \type\()_h264_qpel8_v_lowpass_neon
ld1 {v16.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v25.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v27.8B}, [x1], x3
ld1 {v28.8B}, [x1]
ld1 {v16.8b}, [x1], x3
ld1 {v17.8b}, [x1], x3
ld1 {v18.8b}, [x1], x3
ld1 {v19.8b}, [x1], x3
ld1 {v20.8b}, [x1], x3
ld1 {v21.8b}, [x1], x3
ld1 {v22.8b}, [x1], x3
ld1 {v23.8b}, [x1], x3
ld1 {v24.8b}, [x1], x3
ld1 {v25.8b}, [x1], x3
ld1 {v26.8b}, [x1], x3
ld1 {v27.8b}, [x1], x3
ld1 {v28.8b}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
.ifc \type,avg
ld1 {v24.8B}, [x0], x2
ld1 {v25.8B}, [x0], x2
ld1 {v26.8B}, [x0], x2
urhadd v16.8B, v16.8B, v24.8B
ld1 {v27.8B}, [x0], x2
urhadd v17.8B, v17.8B, v25.8B
ld1 {v28.8B}, [x0], x2
urhadd v18.8B, v18.8B, v26.8B
ld1 {v29.8B}, [x0], x2
urhadd v19.8B, v19.8B, v27.8B
ld1 {v30.8B}, [x0], x2
urhadd v20.8B, v20.8B, v28.8B
ld1 {v31.8B}, [x0], x2
urhadd v21.8B, v21.8B, v29.8B
urhadd v22.8B, v22.8B, v30.8B
urhadd v23.8B, v23.8B, v31.8B
ld1 {v24.8b}, [x0], x2
ld1 {v25.8b}, [x0], x2
ld1 {v26.8b}, [x0], x2
urhadd v16.8b, v16.8b, v24.8b
ld1 {v27.8b}, [x0], x2
urhadd v17.8b, v17.8b, v25.8b
ld1 {v28.8b}, [x0], x2
urhadd v18.8b, v18.8b, v26.8b
ld1 {v29.8b}, [x0], x2
urhadd v19.8b, v19.8b, v27.8b
ld1 {v30.8b}, [x0], x2
urhadd v20.8b, v20.8b, v28.8b
ld1 {v31.8b}, [x0], x2
urhadd v21.8b, v21.8b, v29.8b
urhadd v22.8b, v22.8b, v30.8b
urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x2, lsl #3
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v18.8B}, [x0], x2
st1 {v19.8B}, [x0], x2
st1 {v20.8B}, [x0], x2
st1 {v21.8B}, [x0], x2
st1 {v22.8B}, [x0], x2
st1 {v23.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
st1 {v18.8b}, [x0], x2
st1 {v19.8b}, [x0], x2
st1 {v20.8b}, [x0], x2
st1 {v21.8b}, [x0], x2
st1 {v22.8b}, [x0], x2
st1 {v23.8b}, [x0], x2
ret
endfunc
@@ -343,70 +343,70 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon
ld1 {v16.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v25.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v27.8B}, [x1], x3
ld1 {v28.8B}, [x1]
ld1 {v16.8b}, [x1], x3
ld1 {v17.8b}, [x1], x3
ld1 {v18.8b}, [x1], x3
ld1 {v19.8b}, [x1], x3
ld1 {v20.8b}, [x1], x3
ld1 {v21.8b}, [x1], x3
ld1 {v22.8b}, [x1], x3
ld1 {v23.8b}, [x1], x3
ld1 {v24.8b}, [x1], x3
ld1 {v25.8b}, [x1], x3
ld1 {v26.8b}, [x1], x3
ld1 {v27.8b}, [x1], x3
ld1 {v28.8b}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
ld1 {v24.8B}, [x12], x2
ld1 {v25.8B}, [x12], x2
ld1 {v26.8B}, [x12], x2
ld1 {v27.8B}, [x12], x2
ld1 {v28.8B}, [x12], x2
urhadd v16.8B, v24.8B, v16.8B
urhadd v17.8B, v25.8B, v17.8B
ld1 {v29.8B}, [x12], x2
urhadd v18.8B, v26.8B, v18.8B
urhadd v19.8B, v27.8B, v19.8B
ld1 {v30.8B}, [x12], x2
urhadd v20.8B, v28.8B, v20.8B
urhadd v21.8B, v29.8B, v21.8B
ld1 {v31.8B}, [x12], x2
urhadd v22.8B, v30.8B, v22.8B
urhadd v23.8B, v31.8B, v23.8B
ld1 {v24.8b}, [x12], x2
ld1 {v25.8b}, [x12], x2
ld1 {v26.8b}, [x12], x2
ld1 {v27.8b}, [x12], x2
ld1 {v28.8b}, [x12], x2
urhadd v16.8b, v24.8b, v16.8b
urhadd v17.8b, v25.8b, v17.8b
ld1 {v29.8b}, [x12], x2
urhadd v18.8b, v26.8b, v18.8b
urhadd v19.8b, v27.8b, v19.8b
ld1 {v30.8b}, [x12], x2
urhadd v20.8b, v28.8b, v20.8b
urhadd v21.8b, v29.8b, v21.8b
ld1 {v31.8b}, [x12], x2
urhadd v22.8b, v30.8b, v22.8b
urhadd v23.8b, v31.8b, v23.8b
.ifc \type,avg
ld1 {v24.8B}, [x0], x3
ld1 {v25.8B}, [x0], x3
ld1 {v26.8B}, [x0], x3
urhadd v16.8B, v16.8B, v24.8B
ld1 {v27.8B}, [x0], x3
urhadd v17.8B, v17.8B, v25.8B
ld1 {v28.8B}, [x0], x3
urhadd v18.8B, v18.8B, v26.8B
ld1 {v29.8B}, [x0], x3
urhadd v19.8B, v19.8B, v27.8B
ld1 {v30.8B}, [x0], x3
urhadd v20.8B, v20.8B, v28.8B
ld1 {v31.8B}, [x0], x3
urhadd v21.8B, v21.8B, v29.8B
urhadd v22.8B, v22.8B, v30.8B
urhadd v23.8B, v23.8B, v31.8B
ld1 {v24.8b}, [x0], x3
ld1 {v25.8b}, [x0], x3
ld1 {v26.8b}, [x0], x3
urhadd v16.8b, v16.8b, v24.8b
ld1 {v27.8b}, [x0], x3
urhadd v17.8b, v17.8b, v25.8b
ld1 {v28.8b}, [x0], x3
urhadd v18.8b, v18.8b, v26.8b
ld1 {v29.8b}, [x0], x3
urhadd v19.8b, v19.8b, v27.8b
ld1 {v30.8b}, [x0], x3
urhadd v20.8b, v20.8b, v28.8b
ld1 {v31.8b}, [x0], x3
urhadd v21.8b, v21.8b, v29.8b
urhadd v22.8b, v22.8b, v30.8b
urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x3, lsl #3
.endif
st1 {v16.8B}, [x0], x3
st1 {v17.8B}, [x0], x3
st1 {v18.8B}, [x0], x3
st1 {v19.8B}, [x0], x3
st1 {v20.8B}, [x0], x3
st1 {v21.8B}, [x0], x3
st1 {v22.8B}, [x0], x3
st1 {v23.8B}, [x0], x3
st1 {v16.8b}, [x0], x3
st1 {v17.8b}, [x0], x3
st1 {v18.8b}, [x0], x3
st1 {v19.8b}, [x0], x3
st1 {v20.8b}, [x0], x3
st1 {v21.8b}, [x0], x3
st1 {v22.8b}, [x0], x3
st1 {v23.8b}, [x0], x3
ret
endfunc
@@ -417,19 +417,19 @@ endfunc
function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const w12
ld1 {v16.8H}, [x1], x3
ld1 {v17.8H}, [x1], x3
ld1 {v18.8H}, [x1], x3
ld1 {v19.8H}, [x1], x3
ld1 {v20.8H}, [x1], x3
ld1 {v21.8H}, [x1], x3
ld1 {v22.8H}, [x1], x3
ld1 {v23.8H}, [x1], x3
ld1 {v24.8H}, [x1], x3
ld1 {v25.8H}, [x1], x3
ld1 {v26.8H}, [x1], x3
ld1 {v27.8H}, [x1], x3
ld1 {v28.8H}, [x1]
ld1 {v16.8h}, [x1], x3
ld1 {v17.8h}, [x1], x3
ld1 {v18.8h}, [x1], x3
ld1 {v19.8h}, [x1], x3
ld1 {v20.8h}, [x1], x3
ld1 {v21.8h}, [x1], x3
ld1 {v22.8h}, [x1], x3
ld1 {v23.8h}, [x1], x3
ld1 {v24.8h}, [x1], x3
ld1 {v25.8h}, [x1], x3
ld1 {v26.8h}, [x1], x3
ld1 {v27.8h}, [x1], x3
ld1 {v28.8h}, [x1]
lowpass_8H v16, v17
lowpass_8H v18, v19
lowpass_8H v20, v21
@@ -458,33 +458,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
ld1 {v0.8B}, [x0], x2
ld1 {v1.8B}, [x0], x2
ld1 {v2.8B}, [x0], x2
urhadd v16.8B, v16.8B, v0.8B
ld1 {v3.8B}, [x0], x2
urhadd v17.8B, v17.8B, v1.8B
ld1 {v4.8B}, [x0], x2
urhadd v18.8B, v18.8B, v2.8B
ld1 {v5.8B}, [x0], x2
urhadd v19.8B, v19.8B, v3.8B
ld1 {v6.8B}, [x0], x2
urhadd v20.8B, v20.8B, v4.8B
ld1 {v7.8B}, [x0], x2
urhadd v21.8B, v21.8B, v5.8B
urhadd v22.8B, v22.8B, v6.8B
urhadd v23.8B, v23.8B, v7.8B
ld1 {v0.8b}, [x0], x2
ld1 {v1.8b}, [x0], x2
ld1 {v2.8b}, [x0], x2
urhadd v16.8b, v16.8b, v0.8b
ld1 {v3.8b}, [x0], x2
urhadd v17.8b, v17.8b, v1.8b
ld1 {v4.8b}, [x0], x2
urhadd v18.8b, v18.8b, v2.8b
ld1 {v5.8b}, [x0], x2
urhadd v19.8b, v19.8b, v3.8b
ld1 {v6.8b}, [x0], x2
urhadd v20.8b, v20.8b, v4.8b
ld1 {v7.8b}, [x0], x2
urhadd v21.8b, v21.8b, v5.8b
urhadd v22.8b, v22.8b, v6.8b
urhadd v23.8b, v23.8b, v7.8b
sub x0, x0, x2, lsl #3
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v18.8B}, [x0], x2
st1 {v19.8B}, [x0], x2
st1 {v20.8B}, [x0], x2
st1 {v21.8B}, [x0], x2
st1 {v22.8B}, [x0], x2
st1 {v23.8B}, [x0], x2
st1 {v16.8b}, [x0], x2
st1 {v17.8b}, [x0], x2
st1 {v18.8b}, [x0], x2
st1 {v19.8b}, [x0], x2
st1 {v20.8b}, [x0], x2
st1 {v21.8b}, [x0], x2
st1 {v22.8b}, [x0], x2
st1 {v23.8b}, [x0], x2
ret x10
endfunc
@@ -498,45 +498,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
ld1 {v0.8B, v1.8B}, [x2], #16
ld1 {v2.8B, v3.8B}, [x2], #16
urhadd v0.8B, v0.8B, v16.8B
urhadd v1.8B, v1.8B, v17.8B
ld1 {v4.8B, v5.8B}, [x2], #16
urhadd v2.8B, v2.8B, v18.8B
urhadd v3.8B, v3.8B, v19.8B
ld1 {v6.8B, v7.8B}, [x2], #16
urhadd v4.8B, v4.8B, v20.8B
urhadd v5.8B, v5.8B, v21.8B
urhadd v6.8B, v6.8B, v22.8B
urhadd v7.8B, v7.8B, v23.8B
ld1 {v0.8b, v1.8b}, [x2], #16
ld1 {v2.8b, v3.8b}, [x2], #16
urhadd v0.8b, v0.8b, v16.8b
urhadd v1.8b, v1.8b, v17.8b
ld1 {v4.8b, v5.8b}, [x2], #16
urhadd v2.8b, v2.8b, v18.8b
urhadd v3.8b, v3.8b, v19.8b
ld1 {v6.8b, v7.8b}, [x2], #16
urhadd v4.8b, v4.8b, v20.8b
urhadd v5.8b, v5.8b, v21.8b
urhadd v6.8b, v6.8b, v22.8b
urhadd v7.8b, v7.8b, v23.8b
.ifc \type,avg
ld1 {v16.8B}, [x0], x3
ld1 {v17.8B}, [x0], x3
ld1 {v18.8B}, [x0], x3
urhadd v0.8B, v0.8B, v16.8B
ld1 {v19.8B}, [x0], x3
urhadd v1.8B, v1.8B, v17.8B
ld1 {v20.8B}, [x0], x3
urhadd v2.8B, v2.8B, v18.8B
ld1 {v21.8B}, [x0], x3
urhadd v3.8B, v3.8B, v19.8B
ld1 {v22.8B}, [x0], x3
urhadd v4.8B, v4.8B, v20.8B
ld1 {v23.8B}, [x0], x3
urhadd v5.8B, v5.8B, v21.8B
urhadd v6.8B, v6.8B, v22.8B
urhadd v7.8B, v7.8B, v23.8B
ld1 {v16.8b}, [x0], x3
ld1 {v17.8b}, [x0], x3
ld1 {v18.8b}, [x0], x3
urhadd v0.8b, v0.8b, v16.8b
ld1 {v19.8b}, [x0], x3
urhadd v1.8b, v1.8b, v17.8b
ld1 {v20.8b}, [x0], x3
urhadd v2.8b, v2.8b, v18.8b
ld1 {v21.8b}, [x0], x3
urhadd v3.8b, v3.8b, v19.8b
ld1 {v22.8b}, [x0], x3
urhadd v4.8b, v4.8b, v20.8b
ld1 {v23.8b}, [x0], x3
urhadd v5.8b, v5.8b, v21.8b
urhadd v6.8b, v6.8b, v22.8b
urhadd v7.8b, v7.8b, v23.8b
sub x0, x0, x3, lsl #3
.endif
st1 {v0.8B}, [x0], x3
st1 {v1.8B}, [x0], x3
st1 {v2.8B}, [x0], x3
st1 {v3.8B}, [x0], x3
st1 {v4.8B}, [x0], x3
st1 {v5.8B}, [x0], x3
st1 {v6.8B}, [x0], x3
st1 {v7.8B}, [x0], x3
st1 {v0.8b}, [x0], x3
st1 {v1.8b}, [x0], x3
st1 {v2.8b}, [x0], x3
st1 {v3.8b}, [x0], x3
st1 {v4.8b}, [x0], x3
st1 {v5.8b}, [x0], x3
st1 {v6.8b}, [x0], x3
st1 {v7.8b}, [x0], x3
ret x10
endfunc

@@ -26,295 +26,295 @@
.if \avg
mov x12, x0
.endif
1: ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v2.16B}, [x1], x2
ld1 {v3.16B}, [x1], x2
1: ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x1], x2
ld1 {v2.16b}, [x1], x2
ld1 {v3.16b}, [x1], x2
.if \avg
ld1 {v4.16B}, [x12], x2
urhadd v0.16B, v0.16B, v4.16B
ld1 {v5.16B}, [x12], x2
urhadd v1.16B, v1.16B, v5.16B
ld1 {v6.16B}, [x12], x2
urhadd v2.16B, v2.16B, v6.16B
ld1 {v7.16B}, [x12], x2
urhadd v3.16B, v3.16B, v7.16B
ld1 {v4.16b}, [x12], x2
urhadd v0.16b, v0.16b, v4.16b
ld1 {v5.16b}, [x12], x2
urhadd v1.16b, v1.16b, v5.16b
ld1 {v6.16b}, [x12], x2
urhadd v2.16b, v2.16b, v6.16b
ld1 {v7.16b}, [x12], x2
urhadd v3.16b, v3.16b, v7.16b
.endif
subs w3, w3, #4
st1 {v0.16B}, [x0], x2
st1 {v1.16B}, [x0], x2
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
st1 {v0.16b}, [x0], x2
st1 {v1.16b}, [x0], x2
st1 {v2.16b}, [x0], x2
st1 {v3.16b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels16_x2 rnd=1, avg=0
1: ld1 {v0.16B, v1.16B}, [x1], x2
ld1 {v2.16B, v3.16B}, [x1], x2
1: ld1 {v0.16b, v1.16b}, [x1], x2
ld1 {v2.16b, v3.16b}, [x1], x2
subs w3, w3, #2
ext v1.16B, v0.16B, v1.16B, #1
avg v0.16B, v0.16B, v1.16B
ext v3.16B, v2.16B, v3.16B, #1
avg v2.16B, v2.16B, v3.16B
ext v1.16b, v0.16b, v1.16b, #1
avg v0.16b, v0.16b, v1.16b
ext v3.16b, v2.16b, v3.16b, #1
avg v2.16b, v2.16b, v3.16b
.if \avg
ld1 {v1.16B}, [x0], x2
ld1 {v3.16B}, [x0]
urhadd v0.16B, v0.16B, v1.16B
urhadd v2.16B, v2.16B, v3.16B
ld1 {v1.16b}, [x0], x2
ld1 {v3.16b}, [x0]
urhadd v0.16b, v0.16b, v1.16b
urhadd v2.16b, v2.16b, v3.16b
sub x0, x0, x2
.endif
st1 {v0.16B}, [x0], x2
st1 {v2.16B}, [x0], x2
st1 {v0.16b}, [x0], x2
st1 {v2.16b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels16_y2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x1], x2
1: subs w3, w3, #2
avg v2.16B, v0.16B, v1.16B
ld1 {v0.16B}, [x1], x2
avg v3.16B, v0.16B, v1.16B
ld1 {v1.16B}, [x1], x2
avg v2.16b, v0.16b, v1.16b
ld1 {v0.16b}, [x1], x2
avg v3.16b, v0.16b, v1.16b
ld1 {v1.16b}, [x1], x2
.if \avg
ld1 {v4.16B}, [x0], x2
ld1 {v5.16B}, [x0]
urhadd v2.16B, v2.16B, v4.16B
urhadd v3.16B, v3.16B, v5.16B
ld1 {v4.16b}, [x0], x2
ld1 {v5.16b}, [x0]
urhadd v2.16b, v2.16b, v4.16b
urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2
.endif
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
st1 {v2.16b}, [x0], x2
st1 {v3.16b}, [x0], x2
b.ne 1b
avg v2.16B, v0.16B, v1.16B
ld1 {v0.16B}, [x1], x2
avg v3.16B, v0.16B, v1.16B
avg v2.16b, v0.16b, v1.16b
ld1 {v0.16b}, [x1], x2
avg v3.16b, v0.16b, v1.16b
.if \avg
ld1 {v4.16B}, [x0], x2
ld1 {v5.16B}, [x0]
urhadd v2.16B, v2.16B, v4.16B
urhadd v3.16B, v3.16B, v5.16B
ld1 {v4.16b}, [x0], x2
ld1 {v5.16b}, [x0]
urhadd v2.16b, v2.16b, v4.16b
urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2
.endif
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
st1 {v2.16b}, [x0], x2
st1 {v3.16b}, [x0], x2
ret
.endm
.macro pixels16_xy2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2
ld1 {v4.16B, v5.16B}, [x1], x2
ld1 {v0.16b, v1.16b}, [x1], x2
ld1 {v4.16b, v5.16b}, [x1], x2
NRND movi v26.8H, #1
ext v1.16B, v0.16B, v1.16B, #1
ext v5.16B, v4.16B, v5.16B, #1
uaddl v16.8H, v0.8B, v1.8B
uaddl2 v20.8H, v0.16B, v1.16B
uaddl v18.8H, v4.8B, v5.8B
uaddl2 v22.8H, v4.16B, v5.16B
ext v1.16b, v0.16b, v1.16b, #1
ext v5.16b, v4.16b, v5.16b, #1
uaddl v16.8h, v0.8b, v1.8b
uaddl2 v20.8h, v0.16b, v1.16b
uaddl v18.8h, v4.8b, v5.8b
uaddl2 v22.8h, v4.16b, v5.16b
1: subs w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2
add v24.8H, v16.8H, v18.8H
ld1 {v0.16b, v1.16b}, [x1], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1
add v1.8H, v20.8H, v22.8H
mshrn v28.8B, v24.8H, #2
ext v30.16b, v0.16b, v1.16b, #1
add v1.8h, v20.8h, v22.8h
mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2
mshrn2 v28.16b, v1.8h, #2
.if \avg
ld1 {v16.16B}, [x0]
urhadd v28.16B, v28.16B, v16.16B
ld1 {v16.16b}, [x0]
urhadd v28.16b, v28.16b, v16.16b
.endif
uaddl v16.8H, v0.8B, v30.8B
ld1 {v2.16B, v3.16B}, [x1], x2
uaddl2 v20.8H, v0.16B, v30.16B
st1 {v28.16B}, [x0], x2
add v24.8H, v16.8H, v18.8H
uaddl v16.8h, v0.8b, v30.8b
ld1 {v2.16b, v3.16b}, [x1], x2
uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16b}, [x0], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
ext v3.16B, v2.16B, v3.16B, #1
add v0.8H, v20.8H, v22.8H
mshrn v30.8B, v24.8H, #2
ext v3.16b, v2.16b, v3.16b, #1
add v0.8h, v20.8h, v22.8h
mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2
mshrn2 v30.16b, v0.8h, #2
.if \avg
ld1 {v18.16B}, [x0]
urhadd v30.16B, v30.16B, v18.16B
ld1 {v18.16b}, [x0]
urhadd v30.16b, v30.16b, v18.16b
.endif
uaddl v18.8H, v2.8B, v3.8B
uaddl2 v22.8H, v2.16B, v3.16B
st1 {v30.16B}, [x0], x2
uaddl v18.8h, v2.8b, v3.8b
uaddl2 v22.8h, v2.16b, v3.16b
st1 {v30.16b}, [x0], x2
b.gt 1b
ld1 {v0.16B, v1.16B}, [x1], x2
add v24.8H, v16.8H, v18.8H
ld1 {v0.16b, v1.16b}, [x1], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1
add v1.8H, v20.8H, v22.8H
mshrn v28.8B, v24.8H, #2
ext v30.16b, v0.16b, v1.16b, #1
add v1.8h, v20.8h, v22.8h
mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2
mshrn2 v28.16b, v1.8h, #2
.if \avg
ld1 {v16.16B}, [x0]
urhadd v28.16B, v28.16B, v16.16B
ld1 {v16.16b}, [x0]
urhadd v28.16b, v28.16b, v16.16b
.endif
uaddl v16.8H, v0.8B, v30.8B
uaddl2 v20.8H, v0.16B, v30.16B
st1 {v28.16B}, [x0], x2
add v24.8H, v16.8H, v18.8H
uaddl v16.8h, v0.8b, v30.8b
uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16b}, [x0], x2
add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H
add v0.8H, v20.8H, v22.8H
mshrn v30.8B, v24.8H, #2
add v0.8h, v20.8h, v22.8h
mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2
mshrn2 v30.16b, v0.8h, #2
.if \avg
ld1 {v18.16B}, [x0]
urhadd v30.16B, v30.16B, v18.16B
ld1 {v18.16b}, [x0]
urhadd v30.16b, v30.16b, v18.16b
.endif
st1 {v30.16B}, [x0], x2
st1 {v30.16b}, [x0], x2
ret
.endm
.macro pixels8 rnd=1, avg=0
1: ld1 {v0.8B}, [x1], x2
ld1 {v1.8B}, [x1], x2
ld1 {v2.8B}, [x1], x2
ld1 {v3.8B}, [x1], x2
1: ld1 {v0.8b}, [x1], x2
ld1 {v1.8b}, [x1], x2
ld1 {v2.8b}, [x1], x2
ld1 {v3.8b}, [x1], x2
.if \avg
ld1 {v4.8B}, [x0], x2
urhadd v0.8B, v0.8B, v4.8B
ld1 {v5.8B}, [x0], x2
urhadd v1.8B, v1.8B, v5.8B
ld1 {v6.8B}, [x0], x2
urhadd v2.8B, v2.8B, v6.8B
ld1 {v7.8B}, [x0], x2
urhadd v3.8B, v3.8B, v7.8B
ld1 {v4.8b}, [x0], x2
urhadd v0.8b, v0.8b, v4.8b
ld1 {v5.8b}, [x0], x2
urhadd v1.8b, v1.8b, v5.8b
ld1 {v6.8b}, [x0], x2
urhadd v2.8b, v2.8b, v6.8b
ld1 {v7.8b}, [x0], x2
urhadd v3.8b, v3.8b, v7.8b
sub x0, x0, x2, lsl #2
.endif
subs w3, w3, #4
st1 {v0.8B}, [x0], x2
st1 {v1.8B}, [x0], x2
st1 {v2.8B}, [x0], x2
st1 {v3.8B}, [x0], x2
st1 {v0.8b}, [x0], x2
st1 {v1.8b}, [x0], x2
st1 {v2.8b}, [x0], x2
st1 {v3.8b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels8_x2 rnd=1, avg=0
1: ld1 {v0.8B, v1.8B}, [x1], x2
ext v1.8B, v0.8B, v1.8B, #1
ld1 {v2.8B, v3.8B}, [x1], x2
ext v3.8B, v2.8B, v3.8B, #1
1: ld1 {v0.8b, v1.8b}, [x1], x2
ext v1.8b, v0.8b, v1.8b, #1
ld1 {v2.8b, v3.8b}, [x1], x2
ext v3.8b, v2.8b, v3.8b, #1
subs w3, w3, #2
avg v0.8B, v0.8B, v1.8B
avg v2.8B, v2.8B, v3.8B
avg v0.8b, v0.8b, v1.8b
avg v2.8b, v2.8b, v3.8b
.if \avg
ld1 {v4.8B}, [x0], x2
ld1 {v5.8B}, [x0]
urhadd v0.8B, v0.8B, v4.8B
urhadd v2.8B, v2.8B, v5.8B
ld1 {v4.8b}, [x0], x2
ld1 {v5.8b}, [x0]
urhadd v0.8b, v0.8b, v4.8b
urhadd v2.8b, v2.8b, v5.8b
sub x0, x0, x2
.endif
st1 {v0.8B}, [x0], x2
st1 {v2.8B}, [x0], x2
st1 {v0.8b}, [x0], x2
st1 {v2.8b}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels8_y2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.8B}, [x1], x2
ld1 {v1.8B}, [x1], x2
ld1 {v0.8b}, [x1], x2
ld1 {v1.8b}, [x1], x2
1: subs w3, w3, #2
avg v4.8B, v0.8B, v1.8B
ld1 {v0.8B}, [x1], x2
avg v5.8B, v0.8B, v1.8B
ld1 {v1.8B}, [x1], x2
avg v4.8b, v0.8b, v1.8b
ld1 {v0.8b}, [x1], x2
avg v5.8b, v0.8b, v1.8b
ld1 {v1.8b}, [x1], x2
.if \avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v4.8B, v4.8B, v2.8B
urhadd v5.8B, v5.8B, v3.8B
ld1 {v2.8b}, [x0], x2
ld1 {v3.8b}, [x0]
urhadd v4.8b, v4.8b, v2.8b
urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2
.endif
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2
b.ne 1b
avg v4.8B, v0.8B, v1.8B
ld1 {v0.8B}, [x1], x2
avg v5.8B, v0.8B, v1.8B
avg v4.8b, v0.8b, v1.8b
ld1 {v0.8b}, [x1], x2
avg v5.8b, v0.8b, v1.8b
.if \avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v4.8B, v4.8B, v2.8B
urhadd v5.8B, v5.8B, v3.8B
ld1 {v2.8b}, [x0], x2
ld1 {v3.8b}, [x0]
urhadd v4.8b, v4.8b, v2.8b
urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2
.endif
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2
ret
.endm
.macro pixels8_xy2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x1], x2
NRND movi v19.8H, #1
ext v4.16B, v0.16B, v4.16B, #1
ext v6.16B, v1.16B, v6.16B, #1
uaddl v16.8H, v0.8B, v4.8B
uaddl v17.8H, v1.8B, v6.8B
ext v4.16b, v0.16b, v4.16b, #1
ext v6.16b, v1.16b, v6.16b, #1
uaddl v16.8h, v0.8b, v4.8b
uaddl v17.8h, v1.8b, v6.8b
1: subs w3, w3, #2
ld1 {v0.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
ext v4.16B, v0.16B, v4.16B, #1
ld1 {v0.16b}, [x1], x2
add v18.8h, v16.8h, v17.8h
ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B
mshrn v5.8B, v18.8H, #2
ld1 {v1.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8b, v18.8h, #2
ld1 {v1.16b}, [x1], x2
add v18.8h, v16.8h, v17.8h
.if \avg
ld1 {v7.8B}, [x0]
urhadd v5.8B, v5.8B, v7.8B
ld1 {v7.8b}, [x0]
urhadd v5.8b, v5.8b, v7.8b
.endif
NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2
mshrn v7.8B, v18.8H, #2
st1 {v5.8b}, [x0], x2
mshrn v7.8b, v18.8h, #2
.if \avg
ld1 {v5.8B}, [x0]
urhadd v7.8B, v7.8B, v5.8B
ld1 {v5.8b}, [x0]
urhadd v7.8b, v7.8b, v5.8b
.endif
ext v6.16B, v1.16B, v6.16B, #1
uaddl v17.8H, v1.8B, v6.8B
st1 {v7.8B}, [x0], x2
ext v6.16b, v1.16b, v6.16b, #1
uaddl v17.8h, v1.8b, v6.8b
st1 {v7.8b}, [x0], x2
b.gt 1b
ld1 {v0.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
ext v4.16B, v0.16B, v4.16B, #1
ld1 {v0.16b}, [x1], x2
add v18.8h, v16.8h, v17.8h
ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B
mshrn v5.8B, v18.8H, #2
add v18.8H, v16.8H, v17.8H
uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8b, v18.8h, #2
add v18.8h, v16.8h, v17.8h
.if \avg
ld1 {v7.8B}, [x0]
urhadd v5.8B, v5.8B, v7.8B
ld1 {v7.8b}, [x0]
urhadd v5.8b, v5.8b, v7.8b
.endif
NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2
mshrn v7.8B, v18.8H, #2
st1 {v5.8b}, [x0], x2
mshrn v7.8b, v18.8h, #2
.if \avg
ld1 {v5.8B}, [x0]
urhadd v7.8B, v7.8B, v5.8B
ld1 {v5.8b}, [x0]
urhadd v7.8b, v7.8b, v5.8b
.endif
st1 {v7.8B}, [x0], x2
st1 {v7.8b}, [x0], x2
ret
.endm

@@ -1099,7 +1099,7 @@ function vsse_intra16_neon, export=1
cbnz w4, 2b
3:
add v16.4s, v16.4s, v17.4S
add v16.4s, v16.4s, v17.4s
uaddlv d17, v16.4s
fmov w0, s17

@@ -28,146 +28,146 @@
.endm
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8B, \r0\().8B, \r1\().8B
trn2 \r9\().8B, \r0\().8B, \r1\().8B
trn1 \r1\().8B, \r2\().8B, \r3\().8B
trn2 \r3\().8B, \r2\().8B, \r3\().8B
trn1 \r0\().8B, \r4\().8B, \r5\().8B
trn2 \r5\().8B, \r4\().8B, \r5\().8B
trn1 \r2\().8B, \r6\().8B, \r7\().8B
trn2 \r7\().8B, \r6\().8B, \r7\().8B
trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
trn1 \r4\().4H, \r0\().4H, \r2\().4H
trn2 \r2\().4H, \r0\().4H, \r2\().4H
trn1 \r6\().4H, \r5\().4H, \r7\().4H
trn2 \r7\().4H, \r5\().4H, \r7\().4H
trn1 \r5\().4H, \r9\().4H, \r3\().4H
trn2 \r9\().4H, \r9\().4H, \r3\().4H
trn1 \r3\().4H, \r8\().4H, \r1\().4H
trn2 \r8\().4H, \r8\().4H, \r1\().4H
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4h, \r8\().4h, \r1\().4h
trn1 \r0\().2S, \r3\().2S, \r4\().2S
trn2 \r4\().2S, \r3\().2S, \r4\().2S
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2S, \r5\().2S, \r6\().2S
trn2 \r5\().2S, \r5\().2S, \r6\().2S
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2S, \r8\().2S, \r2\().2S
trn1 \r2\().2S, \r8\().2S, \r2\().2S
trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2s, \r8\().2s, \r2\().2s
trn1 \r3\().2S, \r9\().2S, \r7\().2S
trn2 \r7\().2S, \r9\().2S, \r7\().2S
trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16B, \r0\().16B, \r1\().16B
trn2 \t1\().16B, \r0\().16B, \r1\().16B
trn1 \r1\().16B, \r2\().16B, \r3\().16B
trn2 \r3\().16B, \r2\().16B, \r3\().16B
trn1 \r0\().16B, \r4\().16B, \r5\().16B
trn2 \r5\().16B, \r4\().16B, \r5\().16B
trn1 \r2\().16B, \r6\().16B, \r7\().16B
trn2 \r7\().16B, \r6\().16B, \r7\().16B
trn1 \t0\().16b, \r0\().16b, \r1\().16b
trn2 \t1\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
trn1 \r4\().8H, \r0\().8H, \r2\().8H
trn2 \r2\().8H, \r0\().8H, \r2\().8H
trn1 \r6\().8H, \r5\().8H, \r7\().8H
trn2 \r7\().8H, \r5\().8H, \r7\().8H
trn1 \r5\().8H, \t1\().8H, \r3\().8H
trn2 \t1\().8H, \t1\().8H, \r3\().8H
trn1 \r3\().8H, \t0\().8H, \r1\().8H
trn2 \t0\().8H, \t0\().8H, \r1\().8H
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \t1\().8h, \r3\().8h
trn2 \t1\().8h, \t1\().8h, \r3\().8h
trn1 \r3\().8h, \t0\().8h, \r1\().8h
trn2 \t0\().8h, \t0\().8h, \r1\().8h
trn1 \r0\().4S, \r3\().4S, \r4\().4S
trn2 \r4\().4S, \r3\().4S, \r4\().4S
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4S, \r5\().4S, \r6\().4S
trn2 \r5\().4S, \r5\().4S, \r6\().4S
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4S, \t0\().4S, \r2\().4S
trn1 \r2\().4S, \t0\().4S, \r2\().4S
trn2 \r6\().4s, \t0\().4s, \r2\().4s
trn1 \r2\().4s, \t0\().4s, \r2\().4s
trn1 \r3\().4S, \t1\().4S, \r7\().4S
trn2 \r7\().4S, \t1\().4S, \r7\().4S
trn1 \r3\().4s, \t1\().4s, \r7\().4s
trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16B, \r0\().16B, \r1\().16B
trn2 \t5\().16B, \r0\().16B, \r1\().16B
trn1 \t6\().16B, \r2\().16B, \r3\().16B
trn2 \t7\().16B, \r2\().16B, \r3\().16B
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8H, \t4\().8H, \t6\().8H
trn2 \r2\().8H, \t4\().8H, \t6\().8H
trn1 \r1\().8H, \t5\().8H, \t7\().8H
trn2 \r3\().8H, \t5\().8H, \t7\().8H
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8B, \r0\().8B, \r1\().8B
trn2 \t5\().8B, \r0\().8B, \r1\().8B
trn1 \t6\().8B, \r2\().8B, \r3\().8B
trn2 \t7\().8B, \r2\().8B, \r3\().8B
trn1 \t4\().8b, \r0\().8b, \r1\().8b
trn2 \t5\().8b, \r0\().8b, \r1\().8b
trn1 \t6\().8b, \r2\().8b, \r3\().8b
trn2 \t7\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().4H, \t4\().4H, \t6\().4H
trn2 \r2\().4H, \t4\().4H, \t6\().4H
trn1 \r1\().4H, \t5\().4H, \t7\().4H
trn2 \r3\().4H, \t5\().4H, \t7\().4H
trn1 \r0\().4h, \t4\().4h, \t6\().4h
trn2 \r2\().4h, \t4\().4h, \t6\().4h
trn1 \r1\().4h, \t5\().4h, \t7\().4h
trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
trn1 \r6\().4H, \r2\().4H, \r3\().4H
trn2 \r7\().4H, \r2\().4H, \r3\().4H
trn1 \r4\().4h, \r0\().4h, \r1\().4h
trn2 \r5\().4h, \r0\().4h, \r1\().4h
trn1 \r6\().4h, \r2\().4h, \r3\().4h
trn2 \r7\().4h, \r2\().4h, \r3\().4h
trn1 \r0\().2S, \r4\().2S, \r6\().2S
trn2 \r2\().2S, \r4\().2S, \r6\().2S
trn1 \r1\().2S, \r5\().2S, \r7\().2S
trn2 \r3\().2S, \r5\().2S, \r7\().2S
trn1 \r0\().2s, \r4\().2s, \r6\().2s
trn2 \r2\().2s, \r4\().2s, \r6\().2s
trn1 \r1\().2s, \r5\().2s, \r7\().2s
trn2 \r3\().2s, \r5\().2s, \r7\().2s
.endm
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8H, \r0\().8H, \r1\().8H
trn2 \t5\().8H, \r0\().8H, \r1\().8H
trn1 \t6\().8H, \r2\().8H, \r3\().8H
trn2 \t7\().8H, \r2\().8H, \r3\().8H
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().4S, \t4\().4S, \t6\().4S
trn2 \r2\().4S, \t4\().4S, \t6\().4S
trn1 \r1\().4S, \t5\().4S, \t7\().4S
trn2 \r3\().4S, \t5\().4S, \t7\().4S
trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8H, \r0\().8H, \r1\().8H
trn2 \r9\().8H, \r0\().8H, \r1\().8H
trn1 \r1\().8H, \r2\().8H, \r3\().8H
trn2 \r3\().8H, \r2\().8H, \r3\().8H
trn1 \r0\().8H, \r4\().8H, \r5\().8H
trn2 \r5\().8H, \r4\().8H, \r5\().8H
trn1 \r2\().8H, \r6\().8H, \r7\().8H
trn2 \r7\().8H, \r6\().8H, \r7\().8H
trn1 \r8\().8h, \r0\().8h, \r1\().8h
trn2 \r9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
trn1 \r4\().4S, \r0\().4S, \r2\().4S
trn2 \r2\().4S, \r0\().4S, \r2\().4S
trn1 \r6\().4S, \r5\().4S, \r7\().4S
trn2 \r7\().4S, \r5\().4S, \r7\().4S
trn1 \r5\().4S, \r9\().4S, \r3\().4S
trn2 \r9\().4S, \r9\().4S, \r3\().4S
trn1 \r3\().4S, \r8\().4S, \r1\().4S
trn2 \r8\().4S, \r8\().4S, \r1\().4S
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \r9\().4s, \r3\().4s
trn2 \r9\().4s, \r9\().4s, \r3\().4s
trn1 \r3\().4s, \r8\().4s, \r1\().4s
trn2 \r8\().4s, \r8\().4s, \r1\().4s
trn1 \r0\().2D, \r3\().2D, \r4\().2D
trn2 \r4\().2D, \r3\().2D, \r4\().2D
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2D, \r5\().2D, \r6\().2D
trn2 \r5\().2D, \r5\().2D, \r6\().2D
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2D, \r8\().2D, \r2\().2D
trn1 \r2\().2D, \r8\().2D, \r2\().2D
trn2 \r6\().2d, \r8\().2d, \r2\().2d
trn1 \r2\().2d, \r8\().2d, \r2\().2d
trn1 \r3\().2D, \r9\().2D, \r7\().2D
trn2 \r7\().2D, \r9\().2D, \r7\().2D
trn1 \r3\().2d, \r9\().2d, \r7\().2d
trn2 \r7\().2d, \r9\().2d, \r7\().2d
.endm

@@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
add x3, x0, #192*4
add x4, x0, #256*4
mov x5, #64
1: ld1 {v0.4S}, [x0]
ld1 {v1.4S}, [x1], #16
fadd v0.4S, v0.4S, v1.4S
ld1 {v2.4S}, [x2], #16
fadd v0.4S, v0.4S, v2.4S
ld1 {v3.4S}, [x3], #16
fadd v0.4S, v0.4S, v3.4S
ld1 {v4.4S}, [x4], #16
fadd v0.4S, v0.4S, v4.4S
st1 {v0.4S}, [x0], #16
1: ld1 {v0.4s}, [x0]
ld1 {v1.4s}, [x1], #16
fadd v0.4s, v0.4s, v1.4s
ld1 {v2.4s}, [x2], #16
fadd v0.4s, v0.4s, v2.4s
ld1 {v3.4s}, [x3], #16
fadd v0.4s, v0.4s, v3.4s
ld1 {v4.4s}, [x4], #16
fadd v0.4s, v0.4s, v4.4s
st1 {v0.4s}, [x0], #16
subs x5, x5, #4
b.gt 1b
ret
endfunc
function ff_sbr_sum_square_neon, export=1
movi v0.4S, #0
1: ld1 {v1.4S}, [x0], #16
fmla v0.4S, v1.4S, v1.4S
movi v0.4s, #0
1: ld1 {v1.4s}, [x0], #16
fmla v0.4s, v1.4s, v1.4s
subs w1, w1, #2
b.gt 1b
faddp v0.4S, v0.4S, v0.4S
faddp v0.4S, v0.4S, v0.4S
faddp v0.4s, v0.4s, v0.4s
faddp v0.4s, v0.4s, v0.4s
ret
endfunc
function ff_sbr_neg_odd_64_neon, export=1
mov x1, x0
movi v5.4S, #1<<7, lsl #24
ld2 {v0.4S, v1.4S}, [x0], #32
eor v1.16B, v1.16B, v5.16B
ld2 {v2.4S, v3.4S}, [x0], #32
movi v5.4s, #1<<7, lsl #24
ld2 {v0.4s, v1.4s}, [x0], #32
eor v1.16b, v1.16b, v5.16b
ld2 {v2.4s, v3.4s}, [x0], #32
.rept 3
st2 {v0.4S, v1.4S}, [x1], #32
eor v3.16B, v3.16B, v5.16B
ld2 {v0.4S, v1.4S}, [x0], #32
st2 {v2.4S, v3.4S}, [x1], #32
eor v1.16B, v1.16B, v5.16B
ld2 {v2.4S, v3.4S}, [x0], #32
st2 {v0.4s, v1.4s}, [x1], #32
eor v3.16b, v3.16b, v5.16b
ld2 {v0.4s, v1.4s}, [x0], #32
st2 {v2.4s, v3.4s}, [x1], #32
eor v1.16b, v1.16b, v5.16b
ld2 {v2.4s, v3.4s}, [x0], #32
.endr
eor v3.16B, v3.16B, v5.16B
st2 {v0.4S, v1.4S}, [x1], #32
st2 {v2.4S, v3.4S}, [x1], #32
eor v3.16b, v3.16b, v5.16b
st2 {v0.4s, v1.4s}, [x1], #32
st2 {v2.4s, v3.4s}, [x1], #32
ret
endfunc
@@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
add x2, x0, #64*4
mov x3, #-16
mov x4, #-4
movi v6.4S, #1<<7, lsl #24
ld1 {v0.2S}, [x0], #8
st1 {v0.2S}, [x2], #8
movi v6.4s, #1<<7, lsl #24
ld1 {v0.2s}, [x0], #8
st1 {v0.2s}, [x2], #8
.rept 7
ld1 {v1.4S}, [x1], x3
ld1 {v2.4S}, [x0], #16
eor v1.16B, v1.16B, v6.16B
rev64 v1.4S, v1.4S
ext v1.16B, v1.16B, v1.16B, #8
st2 {v1.4S, v2.4S}, [x2], #32
ld1 {v1.4s}, [x1], x3
ld1 {v2.4s}, [x0], #16
eor v1.16b, v1.16b, v6.16b
rev64 v1.4s, v1.4s
ext v1.16b, v1.16b, v1.16b, #8
st2 {v1.4s, v2.4s}, [x2], #32
.endr
add x1, x1, #8
ld1 {v1.2S}, [x1], x4
ld1 {v2.2S}, [x0], #8
ld1 {v1.S}[3], [x1]
ld1 {v2.S}[2], [x0]
eor v1.16B, v1.16B, v6.16B
rev64 v1.4S, v1.4S
st2 {v1.2S, v2.2S}, [x2], #16
st2 {v1.S, v2.S}[2], [x2]
ld1 {v1.2s}, [x1], x4
ld1 {v2.2s}, [x0], #8
ld1 {v1.s}[3], [x1]
ld1 {v2.s}[2], [x0]
eor v1.16b, v1.16b, v6.16b
rev64 v1.4s, v1.4s
st2 {v1.2s, v2.2s}, [x2], #16
st2 {v1.s, v2.s}[2], [x2]
ret
endfunc
@@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
add x2, x1, #60*4
mov x3, #-16
mov x4, #32
movi v6.4S, #1<<7, lsl #24
1: ld1 {v0.4S}, [x2], x3
ld1 {v1.4S}, [x1], #16
eor v0.16B, v0.16B, v6.16B
rev64 v0.4S, v0.4S
ext v0.16B, v0.16B, v0.16B, #8
st2 {v0.4S, v1.4S}, [x0], #32
movi v6.4s, #1<<7, lsl #24
1: ld1 {v0.4s}, [x2], x3
ld1 {v1.4s}, [x1], #16
eor v0.16b, v0.16b, v6.16b
rev64 v0.4s, v0.4s
ext v0.16b, v0.16b, v0.16b, #8
st2 {v0.4s, v1.4s}, [x0], #32
subs x4, x4, #4
b.gt 1b
ret
@@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
add x2, x0, #60*4
mov x3, #-32
mov x4, #32
movi v2.4S, #1<<7, lsl #24
1: ld2 {v0.4S, v1.4S}, [x1], x3
eor v0.16B, v0.16B, v2.16B
rev64 v1.4S, v1.4S
ext v1.16B, v1.16B, v1.16B, #8
st1 {v0.4S}, [x2]
st1 {v1.4S}, [x0], #16
movi v2.4s, #1<<7, lsl #24
1: ld2 {v0.4s, v1.4s}, [x1], x3
eor v0.16b, v0.16b, v2.16b
rev64 v1.4s, v1.4s
ext v1.16b, v1.16b, v1.16b, #8
st1 {v0.4s}, [x2]
st1 {v1.4s}, [x0], #16
sub x2, x2, #16
subs x4, x4, #4
b.gt 1b
@@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
add x3, x0, #124*4
mov x4, #64
mov x5, #-16
1: ld1 {v0.4S}, [x1], #16
ld1 {v1.4S}, [x2], x5
rev64 v2.4S, v0.4S
ext v2.16B, v2.16B, v2.16B, #8
rev64 v3.4S, v1.4S
ext v3.16B, v3.16B, v3.16B, #8
fadd v1.4S, v1.4S, v2.4S
fsub v0.4S, v0.4S, v3.4S
st1 {v0.4S}, [x0], #16
st1 {v1.4S}, [x3], x5
1: ld1 {v0.4s}, [x1], #16
ld1 {v1.4s}, [x2], x5
rev64 v2.4s, v0.4s
ext v2.16b, v2.16b, v2.16b, #8
rev64 v3.4s, v1.4s
ext v3.16b, v3.16b, v3.16b, #8
fadd v1.4s, v1.4s, v2.4s
fsub v0.4s, v0.4s, v3.4s
st1 {v0.4s}, [x0], #16
st1 {v1.4s}, [x3], x5
subs x4, x4, #4
b.gt 1b
ret
@@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
sxtw x4, w4
sxtw x5, w5
movrel x6, factors
ld1 {v7.4S}, [x6]
dup v1.4S, v0.S[0]
mov v2.8B, v1.8B
mov v2.S[2], v7.S[0]
mov v2.S[3], v7.S[0]
fmul v1.4S, v1.4S, v2.4S
ld1 {v0.D}[0], [x3]
ld1 {v0.D}[1], [x2]
fmul v0.4S, v0.4S, v1.4S
fmul v1.4S, v0.4S, v7.4S
rev64 v0.4S, v0.4S
ld1 {v7.4s}, [x6]
dup v1.4s, v0.s[0]
mov v2.8b, v1.8b
mov v2.s[2], v7.s[0]
mov v2.s[3], v7.s[0]
fmul v1.4s, v1.4s, v2.4s
ld1 {v0.d}[0], [x3]
ld1 {v0.d}[1], [x2]
fmul v0.4s, v0.4s, v1.4s
fmul v1.4s, v0.4s, v7.4s
rev64 v0.4s, v0.4s
sub x7, x5, x4
add x0, x0, x4, lsl #3
add x1, x1, x4, lsl #3
sub x1, x1, #16
1: ld1 {v2.4S}, [x1], #16
ld1 {v3.2S}, [x1]
fmul v4.4S, v2.4S, v1.4S
fmul v5.4S, v2.4S, v0.4S
faddp v4.4S, v4.4S, v4.4S
faddp v5.4S, v5.4S, v5.4S
faddp v4.4S, v4.4S, v4.4S
faddp v5.4S, v5.4S, v5.4S
mov v4.S[1], v5.S[0]
fadd v4.2S, v4.2S, v3.2S
st1 {v4.2S}, [x0], #8
1: ld1 {v2.4s}, [x1], #16
ld1 {v3.2s}, [x1]
fmul v4.4s, v2.4s, v1.4s
fmul v5.4s, v2.4s, v0.4s
faddp v4.4s, v4.4s, v4.4s
faddp v5.4s, v5.4s, v5.4s
faddp v4.4s, v4.4s, v4.4s
faddp v5.4s, v5.4s, v5.4s
mov v4.s[1], v5.s[0]
fadd v4.2s, v4.2s, v3.2s
st1 {v4.2s}, [x0], #8
sub x1, x1, #8
subs x7, x7, #1
b.gt 1b
@@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
sxtw x4, w4
mov x5, #40*2*4
add x1, x1, x4, lsl #3
1: ld1 {v0.2S}, [x1], x5
ld1 {v1.S}[0], [x2], #4
fmul v2.4S, v0.4S, v1.S[0]
st1 {v2.2S}, [x0], #8
1: ld1 {v0.2s}, [x1], x5
ld1 {v1.s}[0], [x2], #4
fmul v2.4s, v0.4s, v1.s[0]
st1 {v2.2s}, [x0], #8
subs x3, x3, #1
b.gt 1b
ret
@@ -227,46 +227,46 @@ endfunc
function ff_sbr_autocorrelate_neon, export=1
mov x2, #38
movrel x3, factors
ld1 {v0.4S}, [x3]
movi v1.4S, #0
movi v2.4S, #0
movi v3.4S, #0
ld1 {v4.2S}, [x0], #8
ld1 {v5.2S}, [x0], #8
fmul v16.2S, v4.2S, v4.2S
fmul v17.2S, v5.2S, v4.S[0]
fmul v18.2S, v5.2S, v4.S[1]
1: ld1 {v5.D}[1], [x0], #8
fmla v1.2S, v4.2S, v4.2S
fmla v2.4S, v5.4S, v4.S[0]
fmla v3.4S, v5.4S, v4.S[1]
mov v4.D[0], v5.D[0]
mov v5.D[0], v5.D[1]
ld1 {v0.4s}, [x3]
movi v1.4s, #0
movi v2.4s, #0
movi v3.4s, #0
ld1 {v4.2s}, [x0], #8
ld1 {v5.2s}, [x0], #8
fmul v16.2s, v4.2s, v4.2s
fmul v17.2s, v5.2s, v4.s[0]
fmul v18.2s, v5.2s, v4.s[1]
1: ld1 {v5.d}[1], [x0], #8
fmla v1.2s, v4.2s, v4.2s
fmla v2.4s, v5.4s, v4.s[0]
fmla v3.4s, v5.4s, v4.s[1]
mov v4.d[0], v5.d[0]
mov v5.d[0], v5.d[1]
subs x2, x2, #1
b.gt 1b
fmul v19.2S, v4.2S, v4.2S
fmul v20.2S, v5.2S, v4.S[0]
fmul v21.2S, v5.2S, v4.S[1]
fadd v22.4S, v2.4S, v20.4S
fsub v22.4S, v22.4S, v17.4S
fadd v23.4S, v3.4S, v21.4S
fsub v23.4S, v23.4S, v18.4S
rev64 v23.4S, v23.4S
fmul v23.4S, v23.4S, v0.4S
fadd v22.4S, v22.4S, v23.4S
st1 {v22.4S}, [x1], #16
fadd v23.2S, v1.2S, v19.2S
fsub v23.2S, v23.2S, v16.2S
faddp v23.2S, v23.2S, v23.2S
st1 {v23.S}[0], [x1]
fmul v19.2s, v4.2s, v4.2s
fmul v20.2s, v5.2s, v4.s[0]
fmul v21.2s, v5.2s, v4.s[1]
fadd v22.4s, v2.4s, v20.4s
fsub v22.4s, v22.4s, v17.4s
fadd v23.4s, v3.4s, v21.4s
fsub v23.4s, v23.4s, v18.4s
rev64 v23.4s, v23.4s
fmul v23.4s, v23.4s, v0.4s
fadd v22.4s, v22.4s, v23.4s
st1 {v22.4s}, [x1], #16
fadd v23.2s, v1.2s, v19.2s
fsub v23.2s, v23.2s, v16.2s
faddp v23.2s, v23.2s, v23.2s
st1 {v23.s}[0], [x1]
add x1, x1, #8
rev64 v3.2S, v3.2S
fmul v3.2S, v3.2S, v0.2S
fadd v2.2S, v2.2S, v3.2S
st1 {v2.2S}, [x1]
rev64 v3.2s, v3.2s
fmul v3.2s, v3.2s, v0.2s
fadd v2.2s, v2.2s, v3.2s
st1 {v2.2s}, [x1]
add x1, x1, #16
faddp v1.2S, v1.2S, v1.2S
st1 {v1.S}[0], [x1]
faddp v1.2s, v1.2s, v1.2s
st1 {v1.s}[0], [x1]
ret
endfunc
@@ -278,25 +278,25 @@ endfunc
1: and x3, x3, #0x1ff
add x8, x7, x3, lsl #3
add x3, x3, #2
ld1 {v2.4S}, [x0]
ld1 {v3.2S}, [x1], #8
ld1 {v4.2S}, [x2], #8
ld1 {v5.4S}, [x8]
mov v6.16B, v2.16B
zip1 v3.4S, v3.4S, v3.4S
zip1 v4.4S, v4.4S, v4.4S
fmla v6.4S, v1.4S, v3.4S
fmla v2.4S, v5.4S, v4.4S
fcmeq v7.4S, v3.4S, #0
bif v2.16B, v6.16B, v7.16B
st1 {v2.4S}, [x0], #16
ld1 {v2.4s}, [x0]
ld1 {v3.2s}, [x1], #8
ld1 {v4.2s}, [x2], #8
ld1 {v5.4s}, [x8]
mov v6.16b, v2.16b
zip1 v3.4s, v3.4s, v3.4s
zip1 v4.4s, v4.4s, v4.4s
fmla v6.4s, v1.4s, v3.4s
fmla v2.4s, v5.4s, v4.4s
fcmeq v7.4s, v3.4s, #0
bif v2.16b, v6.16b, v7.16b
st1 {v2.4s}, [x0], #16
subs x5, x5, #2
b.gt 1b
.endm
function ff_sbr_hf_apply_noise_0_neon, export=1
movrel x9, phi_noise_0
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
@@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
movrel x9, phi_noise_1
and x4, x4, #1
add x9, x9, x4, lsl #4
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
function ff_sbr_hf_apply_noise_2_neon, export=1
movrel x9, phi_noise_2
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc
@@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
movrel x9, phi_noise_3
and x4, x4, #1
add x9, x9, x4, lsl #4
ld1 {v1.4S}, [x9]
ld1 {v1.4s}, [x9]
apply_noise_common
ret
endfunc

@@ -54,7 +54,7 @@ endconst
prfm pldl1keep, [\data]
mov x10, x30
movrel x3, idct_coeff_neon
ld1 {v0.2D}, [x3]
ld1 {v0.2d}, [x3]
.endm
.macro idct_end
@@ -74,146 +74,146 @@ endconst
.endm
.macro idct_col4_top y1, y2, y3, y4, i, l
smull\i v7.4S, \y3\l, z2
smull\i v16.4S, \y3\l, z6
smull\i v17.4S, \y2\l, z1
add v19.4S, v23.4S, v7.4S
smull\i v18.4S, \y2\l, z3
add v20.4S, v23.4S, v16.4S
smull\i v5.4S, \y2\l, z5
sub v21.4S, v23.4S, v16.4S
smull\i v6.4S, \y2\l, z7
sub v22.4S, v23.4S, v7.4S
smull\i v7.4s, \y3\l, z2
smull\i v16.4s, \y3\l, z6
smull\i v17.4s, \y2\l, z1
add v19.4s, v23.4s, v7.4s
smull\i v18.4s, \y2\l, z3
add v20.4s, v23.4s, v16.4s
smull\i v5.4s, \y2\l, z5
sub v21.4s, v23.4s, v16.4s
smull\i v6.4s, \y2\l, z7
sub v22.4s, v23.4s, v7.4s
smlal\i v17.4S, \y4\l, z3
smlsl\i v18.4S, \y4\l, z7
smlsl\i v5.4S, \y4\l, z1
smlsl\i v6.4S, \y4\l, z5
smlal\i v17.4s, \y4\l, z3
smlsl\i v18.4s, \y4\l, z7
smlsl\i v5.4s, \y4\l, z1
smlsl\i v6.4s, \y4\l, z5
.endm
.macro idct_row4_neon y1, y2, y3, y4, pass
ld1 {\y1\().2D,\y2\().2D}, [x2], #32
movi v23.4S, #1<<2, lsl #8
orr v5.16B, \y1\().16B, \y2\().16B
ld1 {\y3\().2D,\y4\().2D}, [x2], #32
orr v6.16B, \y3\().16B, \y4\().16B
orr v5.16B, v5.16B, v6.16B
mov x3, v5.D[1]
smlal v23.4S, \y1\().4H, z4
ld1 {\y1\().2d,\y2\().2d}, [x2], #32
movi v23.4s, #1<<2, lsl #8
orr v5.16b, \y1\().16b, \y2\().16b
ld1 {\y3\().2d,\y4\().2d}, [x2], #32
orr v6.16b, \y3\().16b, \y4\().16b
orr v5.16b, v5.16b, v6.16b
mov x3, v5.d[1]
smlal v23.4s, \y1\().4h, z4
idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
idct_col4_top \y1, \y2, \y3, \y4, 1, .4h
cmp x3, #0
b.eq \pass\()f
smull2 v7.4S, \y1\().8H, z4
smlal2 v17.4S, \y2\().8H, z5
smlsl2 v18.4S, \y2\().8H, z1
smull2 v16.4S, \y3\().8H, z2
smlal2 v5.4S, \y2\().8H, z7
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v7.4S
sub v21.4S, v21.4S, v7.4S
add v22.4S, v22.4S, v7.4S
smlal2 v6.4S, \y2\().8H, z3
smull2 v7.4S, \y3\().8H, z6
smlal2 v17.4S, \y4\().8H, z7
smlsl2 v18.4S, \y4\().8H, z5
smlal2 v5.4S, \y4\().8H, z3
smlsl2 v6.4S, \y4\().8H, z1
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v16.4S
add v21.4S, v21.4S, v16.4S
sub v22.4S, v22.4S, v7.4S
smull2 v7.4s, \y1\().8h, z4
smlal2 v17.4s, \y2\().8h, z5
smlsl2 v18.4s, \y2\().8h, z1
smull2 v16.4s, \y3\().8h, z2
smlal2 v5.4s, \y2\().8h, z7
add v19.4s, v19.4s, v7.4s
sub v20.4s, v20.4s, v7.4s
sub v21.4s, v21.4s, v7.4s
add v22.4s, v22.4s, v7.4s
smlal2 v6.4s, \y2\().8h, z3
smull2 v7.4s, \y3\().8h, z6
smlal2 v17.4s, \y4\().8h, z7
smlsl2 v18.4s, \y4\().8h, z5
smlal2 v5.4s, \y4\().8h, z3
smlsl2 v6.4s, \y4\().8h, z1
add v19.4s, v19.4s, v7.4s
sub v20.4s, v20.4s, v16.4s
add v21.4s, v21.4s, v16.4s
sub v22.4s, v22.4s, v7.4s
\pass: add \y3\().4S, v19.4S, v17.4S
add \y4\().4S, v20.4S, v18.4S
shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
add v7.4S, v21.4S, v5.4S
add v16.4S, v22.4S, v6.4S
shrn \y3\().4H, v7.4S, #ROW_SHIFT
shrn \y4\().4H, v16.4S, #ROW_SHIFT
sub v22.4S, v22.4S, v6.4S
sub v19.4S, v19.4S, v17.4S
sub v21.4S, v21.4S, v5.4S
shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
sub v20.4S, v20.4S, v18.4S
shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
\pass: add \y3\().4s, v19.4s, v17.4s
add \y4\().4s, v20.4s, v18.4s
shrn \y1\().4h, \y3\().4s, #ROW_SHIFT
shrn \y2\().4h, \y4\().4s, #ROW_SHIFT
add v7.4s, v21.4s, v5.4s
add v16.4s, v22.4s, v6.4s
shrn \y3\().4h, v7.4s, #ROW_SHIFT
shrn \y4\().4h, v16.4s, #ROW_SHIFT
sub v22.4s, v22.4s, v6.4s
sub v19.4s, v19.4s, v17.4s
sub v21.4s, v21.4s, v5.4s
shrn2 \y1\().8h, v22.4s, #ROW_SHIFT
sub v20.4s, v20.4s, v18.4s
shrn2 \y2\().8h, v21.4s, #ROW_SHIFT
shrn2 \y3\().8h, v20.4s, #ROW_SHIFT
shrn2 \y4\().8h, v19.4s, #ROW_SHIFT
trn1 v16.8H, \y1\().8H, \y2\().8H
trn2 v17.8H, \y1\().8H, \y2\().8H
trn1 v18.8H, \y3\().8H, \y4\().8H
trn2 v19.8H, \y3\().8H, \y4\().8H
trn1 \y1\().4S, v16.4S, v18.4S
trn1 \y2\().4S, v17.4S, v19.4S
trn2 \y3\().4S, v16.4S, v18.4S
trn2 \y4\().4S, v17.4S, v19.4S
trn1 v16.8h, \y1\().8h, \y2\().8h
trn2 v17.8h, \y1\().8h, \y2\().8h
trn1 v18.8h, \y3\().8h, \y4\().8h
trn2 v19.8h, \y3\().8h, \y4\().8h
trn1 \y1\().4s, v16.4s, v18.4s
trn1 \y2\().4s, v17.4s, v19.4s
trn2 \y3\().4s, v16.4s, v18.4s
trn2 \y4\().4s, v17.4s, v19.4s
.endm
.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
dup v23.4H, z4c
dup v23.4h, z4c
.if \i == 1
add v23.4H, v23.4H, v24.4H
add v23.4h, v23.4h, v24.4h
.else
mov v5.D[0], v24.D[1]
add v23.4H, v23.4H, v5.4H
mov v5.d[0], v24.d[1]
add v23.4h, v23.4h, v5.4h
.endif
smull v23.4S, v23.4H, z4
smull v23.4s, v23.4h, z4
idct_col4_top v24, v25, v26, v27, \i, \l
mov x4, v28.D[\i - 1]
mov x5, v29.D[\i - 1]
mov x4, v28.d[\i - 1]
mov x5, v29.d[\i - 1]
cmp x4, #0
b.eq 1f
smull\i v7.4S, v28\l, z4
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v7.4S
sub v21.4S, v21.4S, v7.4S
add v22.4S, v22.4S, v7.4S
smull\i v7.4s, v28\l, z4
add v19.4s, v19.4s, v7.4s
sub v20.4s, v20.4s, v7.4s
sub v21.4s, v21.4s, v7.4s
add v22.4s, v22.4s, v7.4s
1: mov x4, v30.D[\i - 1]
1: mov x4, v30.d[\i - 1]
cmp x5, #0
b.eq 2f
smlal\i v17.4S, v29\l, z5
smlsl\i v18.4S, v29\l, z1
smlal\i v5.4S, v29\l, z7
smlal\i v6.4S, v29\l, z3
smlal\i v17.4s, v29\l, z5
smlsl\i v18.4s, v29\l, z1
smlal\i v5.4s, v29\l, z7
smlal\i v6.4s, v29\l, z3
2: mov x5, v31.D[\i - 1]
2: mov x5, v31.d[\i - 1]
cmp x4, #0
b.eq 3f
smull\i v7.4S, v30\l, z6
smull\i v16.4S, v30\l, z2
add v19.4S, v19.4S, v7.4S
sub v22.4S, v22.4S, v7.4S
sub v20.4S, v20.4S, v16.4S
add v21.4S, v21.4S, v16.4S
smull\i v7.4s, v30\l, z6
smull\i v16.4s, v30\l, z2
add v19.4s, v19.4s, v7.4s
sub v22.4s, v22.4s, v7.4s
sub v20.4s, v20.4s, v16.4s
add v21.4s, v21.4s, v16.4s
3: cmp x5, #0
b.eq 4f
smlal\i v17.4S, v31\l, z7
smlsl\i v18.4S, v31\l, z5
smlal\i v5.4S, v31\l, z3
smlsl\i v6.4S, v31\l, z1
smlal\i v17.4s, v31\l, z7
smlsl\i v18.4s, v31\l, z5
smlal\i v5.4s, v31\l, z3
smlsl\i v6.4s, v31\l, z1
4: addhn v7.4H, v19.4S, v17.4S
addhn2 v7.8H, v20.4S, v18.4S
subhn v18.4H, v20.4S, v18.4S
subhn2 v18.8H, v19.4S, v17.4S
4: addhn v7.4h, v19.4s, v17.4s
addhn2 v7.8h, v20.4s, v18.4s
subhn v18.4h, v20.4s, v18.4s
subhn2 v18.8h, v19.4s, v17.4s
addhn v16.4H, v21.4S, v5.4S
addhn2 v16.8H, v22.4S, v6.4S
subhn v17.4H, v22.4S, v6.4S
subhn2 v17.8H, v21.4S, v5.4S
addhn v16.4h, v21.4s, v5.4s
addhn2 v16.8h, v22.4s, v6.4s
subhn v17.4h, v22.4s, v6.4s
subhn2 v17.8h, v21.4s, v5.4s
ret
endfunc
@@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1
sqshrun v1.8B, v7.8H, #COL_SHIFT-16
sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
sqshrun v3.8B, v17.8H, #COL_SHIFT-16
sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
sqshrun v1.8b, v7.8h, #COL_SHIFT-16
sqshrun2 v1.16b, v16.8h, #COL_SHIFT-16
sqshrun v3.8b, v17.8h, #COL_SHIFT-16
sqshrun2 v3.16b, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2
sqshrun v2.8B, v7.8H, #COL_SHIFT-16
sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
sqshrun v4.8B, v17.8H, #COL_SHIFT-16
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
sqshrun v2.8b, v7.8h, #COL_SHIFT-16
sqshrun2 v2.16b, v16.8h, #COL_SHIFT-16
sqshrun v4.8b, v17.8h, #COL_SHIFT-16
sqshrun2 v4.16b, v18.8h, #COL_SHIFT-16
zip1 v16.4S, v1.4S, v2.4S
zip2 v17.4S, v1.4S, v2.4S
zip1 v16.4s, v1.4s, v2.4s
zip2 v17.4s, v1.4s, v2.4s
st1 {v16.D}[0], [x0], x1
st1 {v16.D}[1], [x0], x1
st1 {v16.d}[0], [x0], x1
st1 {v16.d}[1], [x0], x1
zip1 v18.4S, v3.4S, v4.4S
zip2 v19.4S, v3.4S, v4.4S
zip1 v18.4s, v3.4s, v4.4s
zip2 v19.4s, v3.4s, v4.4s
st1 {v17.D}[0], [x0], x1
st1 {v17.D}[1], [x0], x1
st1 {v18.D}[0], [x0], x1
st1 {v18.D}[1], [x0], x1
st1 {v19.D}[0], [x0], x1
st1 {v19.D}[1], [x0], x1
st1 {v17.d}[0], [x0], x1
st1 {v17.d}[1], [x0], x1
st1 {v18.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
st1 {v19.d}[0], [x0], x1
st1 {v19.d}[1], [x0], x1
idct_end
endfunc
@@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1
sshr v1.8H, v7.8H, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16
sshr v1.8h, v7.8h, #COL_SHIFT-16
sshr v2.8h, v16.8h, #COL_SHIFT-16
sshr v3.8h, v17.8h, #COL_SHIFT-16
sshr v4.8h, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2
sshr v7.8H, v7.8H, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16
sshr v7.8h, v7.8h, #COL_SHIFT-16
sshr v16.8h, v16.8h, #COL_SHIFT-16
sshr v17.8h, v17.8h, #COL_SHIFT-16
sshr v18.8h, v18.8h, #COL_SHIFT-16
mov x9, x0
ld1 {v19.D}[0], [x0], x1
zip1 v23.2D, v1.2D, v7.2D
zip2 v24.2D, v1.2D, v7.2D
ld1 {v19.D}[1], [x0], x1
zip1 v25.2D, v2.2D, v16.2D
zip2 v26.2D, v2.2D, v16.2D
ld1 {v20.D}[0], [x0], x1
zip1 v27.2D, v3.2D, v17.2D
zip2 v28.2D, v3.2D, v17.2D
ld1 {v20.D}[1], [x0], x1
zip1 v29.2D, v4.2D, v18.2D
zip2 v30.2D, v4.2D, v18.2D
ld1 {v21.D}[0], [x0], x1
uaddw v23.8H, v23.8H, v19.8B
uaddw2 v24.8H, v24.8H, v19.16B
ld1 {v21.D}[1], [x0], x1
sqxtun v23.8B, v23.8H
sqxtun2 v23.16B, v24.8H
ld1 {v22.D}[0], [x0], x1
uaddw v24.8H, v25.8H, v20.8B
uaddw2 v25.8H, v26.8H, v20.16B
ld1 {v22.D}[1], [x0], x1
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v25.8H
st1 {v23.D}[0], [x9], x1
uaddw v25.8H, v27.8H, v21.8B
uaddw2 v26.8H, v28.8H, v21.16B
st1 {v23.D}[1], [x9], x1
sqxtun v25.8B, v25.8H
sqxtun2 v25.16B, v26.8H
st1 {v24.D}[0], [x9], x1
uaddw v26.8H, v29.8H, v22.8B
uaddw2 v27.8H, v30.8H, v22.16B
st1 {v24.D}[1], [x9], x1
sqxtun v26.8B, v26.8H
sqxtun2 v26.16B, v27.8H
st1 {v25.D}[0], [x9], x1
st1 {v25.D}[1], [x9], x1
st1 {v26.D}[0], [x9], x1
st1 {v26.D}[1], [x9], x1
ld1 {v19.d}[0], [x0], x1
zip1 v23.2d, v1.2d, v7.2d
zip2 v24.2d, v1.2d, v7.2d
ld1 {v19.d}[1], [x0], x1
zip1 v25.2d, v2.2d, v16.2d
zip2 v26.2d, v2.2d, v16.2d
ld1 {v20.d}[0], [x0], x1
zip1 v27.2d, v3.2d, v17.2d
zip2 v28.2d, v3.2d, v17.2d
ld1 {v20.d}[1], [x0], x1
zip1 v29.2d, v4.2d, v18.2d
zip2 v30.2d, v4.2d, v18.2d
ld1 {v21.d}[0], [x0], x1
uaddw v23.8h, v23.8h, v19.8b
uaddw2 v24.8h, v24.8h, v19.16b
ld1 {v21.d}[1], [x0], x1
sqxtun v23.8b, v23.8h
sqxtun2 v23.16b, v24.8h
ld1 {v22.d}[0], [x0], x1
uaddw v24.8h, v25.8h, v20.8b
uaddw2 v25.8h, v26.8h, v20.16b
ld1 {v22.d}[1], [x0], x1
sqxtun v24.8b, v24.8h
sqxtun2 v24.16b, v25.8h
st1 {v23.d}[0], [x9], x1
uaddw v25.8h, v27.8h, v21.8b
uaddw2 v26.8h, v28.8h, v21.16b
st1 {v23.d}[1], [x9], x1
sqxtun v25.8b, v25.8h
sqxtun2 v25.16b, v26.8h
st1 {v24.d}[0], [x9], x1
uaddw v26.8h, v29.8h, v22.8b
uaddw2 v27.8h, v30.8h, v22.16b
st1 {v24.d}[1], [x9], x1
sqxtun v26.8b, v26.8h
sqxtun2 v26.16b, v27.8h
st1 {v25.d}[0], [x9], x1
st1 {v25.d}[1], [x9], x1
st1 {v26.d}[0], [x9], x1
st1 {v26.d}[1], [x9], x1
idct_end
endfunc
@@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
sub x2, x2, #128
bl idct_col4_neon1
sshr v1.8H, v7.8H, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16
sshr v1.8h, v7.8h, #COL_SHIFT-16
sshr v2.8h, v16.8h, #COL_SHIFT-16
sshr v3.8h, v17.8h, #COL_SHIFT-16
sshr v4.8h, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2
sshr v7.8H, v7.8H, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16
sshr v7.8h, v7.8h, #COL_SHIFT-16
sshr v16.8h, v16.8h, #COL_SHIFT-16
sshr v17.8h, v17.8h, #COL_SHIFT-16
sshr v18.8h, v18.8h, #COL_SHIFT-16
zip1 v23.2D, v1.2D, v7.2D
zip2 v24.2D, v1.2D, v7.2D
st1 {v23.2D,v24.2D}, [x2], #32
zip1 v25.2D, v2.2D, v16.2D
zip2 v26.2D, v2.2D, v16.2D
st1 {v25.2D,v26.2D}, [x2], #32
zip1 v27.2D, v3.2D, v17.2D
zip2 v28.2D, v3.2D, v17.2D
st1 {v27.2D,v28.2D}, [x2], #32
zip1 v29.2D, v4.2D, v18.2D
zip2 v30.2D, v4.2D, v18.2D
st1 {v29.2D,v30.2D}, [x2], #32
zip1 v23.2d, v1.2d, v7.2d
zip2 v24.2d, v1.2d, v7.2d
st1 {v23.2d,v24.2d}, [x2], #32
zip1 v25.2d, v2.2d, v16.2d
zip2 v26.2d, v2.2d, v16.2d
st1 {v25.2d,v26.2d}, [x2], #32
zip1 v27.2d, v3.2d, v17.2d
zip2 v28.2d, v3.2d, v17.2d
st1 {v27.2d,v28.2d}, [x2], #32
zip1 v29.2d, v4.2d, v18.2d
zip2 v30.2d, v4.2d, v18.2d
st1 {v29.2d,v30.2d}, [x2], #32
idct_end
endfunc

@@ -22,19 +22,19 @@
// acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
.macro acc_sum_store x, xb
dup v24.4S, v24.S[3] // ...X -> XXXX
ext v25.16B, v26.16B, \xb, #12 // ext(0000,ABCD,12)=0ABC
add v24.4S, v24.4S, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,0ABC,12)=00AB
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,00AB,12)=000A
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
st1 {v24.4S}, [x0], #16 // write 4x32-bit final values
dup v24.4s, v24.s[3] // ...X -> XXXX
ext v25.16b, v26.16b, \xb, #12 // ext(0000,ABCD,12)=0ABC
add v24.4s, v24.4s, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,0ABC,12)=00AB
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,00AB,12)=000A
add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
st1 {v24.4s}, [x0], #16 // write 4x32-bit final values
.endm
function ff_compute_safe_ssd_integral_image_neon, export=1
movi v26.4S, #0 // used as zero for the "rotations" in acc_sum_store
movi v26.4s, #0 // used as zero for the "rotations" in acc_sum_store
sub x3, x3, w6, UXTW // s1 padding (s1_linesize - w)
sub x5, x5, w6, UXTW // s2 padding (s2_linesize - w)
sub x9, x0, w1, UXTW #2 // dst_top
@@ -43,31 +43,31 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
1: mov w10, w6 // width copy for each line
sub x0, x0, #16 // beginning of the dst line minus 4 sums
sub x8, x9, #4 // dst_top-1
ld1 {v24.4S}, [x0], #16 // load ...X (contextual last sums)
2: ld1 {v0.16B}, [x2], #16 // s1[x + 0..15]
ld1 {v1.16B}, [x4], #16 // s2[x + 0..15]
ld1 {v16.4S,v17.4S}, [x8], #32 // dst_top[x + 0..7 - 1]
usubl v2.8H, v0.8B, v1.8B // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
usubl2 v3.8H, v0.16B, v1.16B // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
ld1 {v18.4S,v19.4S}, [x8], #32 // dst_top[x + 8..15 - 1]
smull v4.4S, v2.4H, v2.4H // d[x + 0..3]^2
smull2 v5.4S, v2.8H, v2.8H // d[x + 4..7]^2
ld1 {v20.4S,v21.4S}, [x9], #32 // dst_top[x + 0..7]
smull v6.4S, v3.4H, v3.4H // d[x + 8..11]^2
smull2 v7.4S, v3.8H, v3.8H // d[x + 12..15]^2
ld1 {v22.4S,v23.4S}, [x9], #32 // dst_top[x + 8..15]
sub v0.4S, v20.4S, v16.4S // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
sub v1.4S, v21.4S, v17.4S // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
add v0.4S, v0.4S, v4.4S // + d[x + 0..3]^2
add v1.4S, v1.4S, v5.4S // + d[x + 4..7]^2
sub v2.4S, v22.4S, v18.4S // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
sub v3.4S, v23.4S, v19.4S // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
add v2.4S, v2.4S, v6.4S // + d[x + 8..11]^2
add v3.4S, v3.4S, v7.4S // + d[x + 12..15]^2
acc_sum_store v0.4S, v0.16B // accumulate and store dst[ 0..3]
acc_sum_store v1.4S, v1.16B // accumulate and store dst[ 4..7]
acc_sum_store v2.4S, v2.16B // accumulate and store dst[ 8..11]
acc_sum_store v3.4S, v3.16B // accumulate and store dst[12..15]
ld1 {v24.4s}, [x0], #16 // load ...X (contextual last sums)
2: ld1 {v0.16b}, [x2], #16 // s1[x + 0..15]
ld1 {v1.16b}, [x4], #16 // s2[x + 0..15]
ld1 {v16.4s,v17.4s}, [x8], #32 // dst_top[x + 0..7 - 1]
usubl v2.8h, v0.8b, v1.8b // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
usubl2 v3.8h, v0.16b, v1.16b // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
ld1 {v18.4s,v19.4s}, [x8], #32 // dst_top[x + 8..15 - 1]
smull v4.4s, v2.4h, v2.4h // d[x + 0..3]^2
smull2 v5.4s, v2.8h, v2.8h // d[x + 4..7]^2
ld1 {v20.4s,v21.4s}, [x9], #32 // dst_top[x + 0..7]
smull v6.4s, v3.4h, v3.4h // d[x + 8..11]^2
smull2 v7.4s, v3.8h, v3.8h // d[x + 12..15]^2
ld1 {v22.4s,v23.4s}, [x9], #32 // dst_top[x + 8..15]
sub v0.4s, v20.4s, v16.4s // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
sub v1.4s, v21.4s, v17.4s // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
add v0.4s, v0.4s, v4.4s // + d[x + 0..3]^2
add v1.4s, v1.4s, v5.4s // + d[x + 4..7]^2
sub v2.4s, v22.4s, v18.4s // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
sub v3.4s, v23.4s, v19.4s // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
add v2.4s, v2.4s, v6.4s // + d[x + 8..11]^2
add v3.4s, v3.4s, v7.4s // + d[x + 12..15]^2
acc_sum_store v0.4s, v0.16b // accumulate and store dst[ 0..3]
acc_sum_store v1.4s, v1.16b // accumulate and store dst[ 4..7]
acc_sum_store v2.4s, v2.16b // accumulate and store dst[ 8..11]
acc_sum_store v3.4s, v3.16b // accumulate and store dst[12..15]
subs w10, w10, #16 // width dec
b.ne 2b // loop til next line
add x2, x2, x3 // skip to next line (s1)

@@ -25,16 +25,16 @@
function ff_vector_fmul_neon, export=1
1: subs w3, w3, #16
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x1], #32
ld1 {v4.4S, v5.4S}, [x2], #32
ld1 {v6.4S, v7.4S}, [x2], #32
fmul v16.4S, v0.4S, v4.4S
fmul v17.4S, v1.4S, v5.4S
fmul v18.4S, v2.4S, v6.4S
fmul v19.4S, v3.4S, v7.4S
st1 {v16.4S, v17.4S}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32
ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4s, v3.4s}, [x1], #32
ld1 {v4.4s, v5.4s}, [x2], #32
ld1 {v6.4s, v7.4s}, [x2], #32
fmul v16.4s, v0.4s, v4.4s
fmul v17.4s, v1.4s, v5.4s
fmul v18.4s, v2.4s, v6.4s
fmul v19.4s, v3.4s, v7.4s
st1 {v16.4s, v17.4s}, [x0], #32
st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b
ret
endfunc
@@ -42,16 +42,16 @@ endfunc
function ff_vector_fmac_scalar_neon, export=1
mov x3, #-32
1: subs w2, w2, #16
ld1 {v16.4S, v17.4S}, [x0], #32
ld1 {v18.4S, v19.4S}, [x0], x3
ld1 {v4.4S, v5.4S}, [x1], #32
ld1 {v6.4S, v7.4S}, [x1], #32
fmla v16.4S, v4.4S, v0.S[0]
fmla v17.4S, v5.4S, v0.S[0]
fmla v18.4S, v6.4S, v0.S[0]
fmla v19.4S, v7.4S, v0.S[0]
st1 {v16.4S, v17.4S}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32
ld1 {v16.4s, v17.4s}, [x0], #32
ld1 {v18.4s, v19.4s}, [x0], x3
ld1 {v4.4s, v5.4s}, [x1], #32
ld1 {v6.4s, v7.4s}, [x1], #32
fmla v16.4s, v4.4s, v0.s[0]
fmla v17.4s, v5.4s, v0.s[0]
fmla v18.4s, v6.4s, v0.s[0]
fmla v19.4s, v7.4s, v0.s[0]
st1 {v16.4s, v17.4s}, [x0], #32
st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b
ret
endfunc
@@ -59,43 +59,43 @@ endfunc
function ff_vector_fmul_scalar_neon, export=1
mov w4, #15
bics w3, w2, w4
dup v16.4S, v0.S[0]
dup v16.4s, v0.s[0]
b.eq 3f
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v0.4s, v1.4s}, [x1], #32
1: subs w3, w3, #16
fmul v0.4S, v0.4S, v16.4S
ld1 {v2.4S, v3.4S}, [x1], #32
fmul v1.4S, v1.4S, v16.4S
fmul v2.4S, v2.4S, v16.4S
st1 {v0.4S, v1.4S}, [x0], #32
fmul v3.4S, v3.4S, v16.4S
fmul v0.4s, v0.4s, v16.4s
ld1 {v2.4s, v3.4s}, [x1], #32
fmul v1.4s, v1.4s, v16.4s
fmul v2.4s, v2.4s, v16.4s
st1 {v0.4s, v1.4s}, [x0], #32
fmul v3.4s, v3.4s, v16.4s
b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32
st1 {v2.4S, v3.4S}, [x0], #32
ld1 {v0.4s, v1.4s}, [x1], #32
st1 {v2.4s, v3.4s}, [x0], #32
b 1b
2: ands w2, w2, #15
st1 {v2.4S, v3.4S}, [x0], #32
st1 {v2.4s, v3.4s}, [x0], #32
b.eq 4f
3: ld1 {v0.4S}, [x1], #16
fmul v0.4S, v0.4S, v16.4S
st1 {v0.4S}, [x0], #16
3: ld1 {v0.4s}, [x1], #16
fmul v0.4s, v0.4s, v16.4s
st1 {v0.4s}, [x0], #16
subs w2, w2, #4
b.gt 3b
4: ret
endfunc
function ff_vector_dmul_scalar_neon, export=1
dup v16.2D, v0.D[0]
ld1 {v0.2D, v1.2D}, [x1], #32
dup v16.2d, v0.d[0]
ld1 {v0.2d, v1.2d}, [x1], #32
1: subs w2, w2, #8
fmul v0.2D, v0.2D, v16.2D
ld1 {v2.2D, v3.2D}, [x1], #32
fmul v1.2D, v1.2D, v16.2D
fmul v2.2D, v2.2D, v16.2D
st1 {v0.2D, v1.2D}, [x0], #32
fmul v3.2D, v3.2D, v16.2D
ld1 {v0.2D, v1.2D}, [x1], #32
st1 {v2.2D, v3.2D}, [x0], #32
fmul v0.2d, v0.2d, v16.2d
ld1 {v2.2d, v3.2d}, [x1], #32
fmul v1.2d, v1.2d, v16.2d
fmul v2.2d, v2.2d, v16.2d
st1 {v0.2d, v1.2d}, [x0], #32
fmul v3.2d, v3.2d, v16.2d
ld1 {v0.2d, v1.2d}, [x1], #32
st1 {v2.2d, v3.2d}, [x0], #32
b.gt 1b
ret
endfunc
@@ -108,49 +108,49 @@ function ff_vector_fmul_window_neon, export=1
add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
mov x7, #-16
ld1 {v0.4S}, [x1], #16 // s0
ld1 {v2.4S}, [x3], #16 // wi
ld1 {v1.4S}, [x2], x7 // s1
1: ld1 {v3.4S}, [x6], x7 // wj
ld1 {v0.4s}, [x1], #16 // s0
ld1 {v2.4s}, [x3], #16 // wi
ld1 {v1.4s}, [x2], x7 // s1
1: ld1 {v3.4s}, [x6], x7 // wj
subs x4, x4, #4
fmul v17.4S, v0.4S, v2.4S // s0 * wi
rev64 v4.4S, v1.4S
rev64 v5.4S, v3.4S
rev64 v17.4S, v17.4S
ext v4.16B, v4.16B, v4.16B, #8 // s1_r
ext v5.16B, v5.16B, v5.16B, #8 // wj_r
ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
fmul v17.4s, v0.4s, v2.4s // s0 * wi
rev64 v4.4s, v1.4s
rev64 v5.4s, v3.4s
rev64 v17.4s, v17.4s
ext v4.16b, v4.16b, v4.16b, #8 // s1_r
ext v5.16b, v5.16b, v5.16b, #8 // wj_r
ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev
fmul v16.4s, v0.4s, v5.4s // s0 * wj_r
fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj
b.eq 2f
ld1 {v0.4S}, [x1], #16
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7
ld1 {v2.4S}, [x3], #16
ld1 {v1.4S}, [x2], x7
st1 {v16.4S}, [x0], #16
ld1 {v0.4s}, [x1], #16
fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
st1 {v17.4s}, [x5], x7
ld1 {v2.4s}, [x3], #16
ld1 {v1.4s}, [x2], x7
st1 {v16.4s}, [x0], #16
b 1b
2:
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7
st1 {v16.4S}, [x0], #16
fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
st1 {v17.4s}, [x5], x7
st1 {v16.4s}, [x0], #16
ret
endfunc
function ff_vector_fmul_add_neon, export=1
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32
ld1 {v4.4S, v5.4S}, [x3], #32
ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4s, v3.4s}, [x2], #32
ld1 {v4.4s, v5.4s}, [x3], #32
1: subs w4, w4, #8
fmla v4.4S, v0.4S, v2.4S
fmla v5.4S, v1.4S, v3.4S
fmla v4.4s, v0.4s, v2.4s
fmla v5.4s, v1.4s, v3.4s
b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32
st1 {v4.4S, v5.4S}, [x0], #32
ld1 {v4.4S, v5.4S}, [x3], #32
ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4s, v3.4s}, [x2], #32
st1 {v4.4s, v5.4s}, [x0], #32
ld1 {v4.4s, v5.4s}, [x3], #32
b 1b
2: st1 {v4.4S, v5.4S}, [x0], #32
2: st1 {v4.4s, v5.4s}, [x0], #32
ret
endfunc
@@ -159,44 +159,44 @@ function ff_vector_fmul_reverse_neon, export=1
add x2, x2, x3, lsl #2
sub x2, x2, #32
mov x4, #-32
ld1 {v2.4S, v3.4S}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4s, v3.4s}, [x2], x4
ld1 {v0.4s, v1.4s}, [x1], #32
1: subs x3, x3, #8
rev64 v3.4S, v3.4S
rev64 v2.4S, v2.4S
ext v3.16B, v3.16B, v3.16B, #8
ext v2.16B, v2.16B, v2.16B, #8
fmul v16.4S, v0.4S, v3.4S
fmul v17.4S, v1.4S, v2.4S
rev64 v3.4s, v3.4s
rev64 v2.4s, v2.4s
ext v3.16b, v3.16b, v3.16b, #8
ext v2.16b, v2.16b, v2.16b, #8
fmul v16.4s, v0.4s, v3.4s
fmul v17.4s, v1.4s, v2.4s
b.eq 2f
ld1 {v2.4S, v3.4S}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32
st1 {v16.4S, v17.4S}, [x0], #32
ld1 {v2.4s, v3.4s}, [x2], x4
ld1 {v0.4s, v1.4s}, [x1], #32
st1 {v16.4s, v17.4s}, [x0], #32
b 1b
2: st1 {v16.4S, v17.4S}, [x0], #32
2: st1 {v16.4s, v17.4s}, [x0], #32
ret
endfunc
function ff_butterflies_float_neon, export=1
1: ld1 {v0.4S}, [x0]
ld1 {v1.4S}, [x1]
1: ld1 {v0.4s}, [x0]
ld1 {v1.4s}, [x1]
subs w2, w2, #4
fsub v2.4S, v0.4S, v1.4S
fadd v3.4S, v0.4S, v1.4S
st1 {v2.4S}, [x1], #16
st1 {v3.4S}, [x0], #16
fsub v2.4s, v0.4s, v1.4s
fadd v3.4s, v0.4s, v1.4s
st1 {v2.4s}, [x1], #16
st1 {v3.4s}, [x0], #16
b.gt 1b
ret
endfunc
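The butterfly above writes the sums back to the first array and the differences to the second, four floats at a time; a scalar sketch:

    static void butterflies_sketch(float *v0, float *v1, int len)
    {
        for (int i = 0; i < len; i++) {
            float t = v0[i] - v1[i];   /* fsub v2 above, stored through x1 */
            v0[i]  += v1[i];           /* fadd v3 above, stored through x0 */
            v1[i]   = t;
        }
    }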
function ff_scalarproduct_float_neon, export=1
movi v2.4S, #0
1: ld1 {v0.4S}, [x0], #16
ld1 {v1.4S}, [x1], #16
movi v2.4s, #0
1: ld1 {v0.4s}, [x0], #16
ld1 {v1.4s}, [x1], #16
subs w2, w2, #4
fmla v2.4S, v0.4S, v1.4S
fmla v2.4s, v0.4s, v1.4s
b.gt 1b
faddp v0.4S, v2.4S, v2.4S
faddp s0, v0.2S
faddp v0.4s, v2.4s, v2.4s
faddp s0, v0.2s
ret
endfunc
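v2.4s holds four partial dot products that the two faddp instructions reduce into s0, the return value. A scalar sketch (len is assumed to be a multiple of 4, as the vector loop requires; because the NEON version keeps four partial sums, results may differ from this in the last bits of precision):

    static float scalarproduct_sketch(const float *v1, const float *v2, int len)
    {
        float p = 0.0f;
        for (int i = 0; i < len; i++)
            p += v1[i] * v2[i];
        return p;
    }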


@ -21,57 +21,57 @@
#include "libavutil/aarch64/asm.S"
function ff_resample_common_apply_filter_x4_float_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.4S}, [x1], #16 // src[0..3]
ld1 {v2.4S}, [x2], #16 // filter[0..3]
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3]
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
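As the comments spell out, this is a dot product of src and filter, accumulated four floats per iteration and written out as a single float; the x8 variant below merely unrolls the same loop further. A scalar sketch (filter_length assumed to be a multiple of 4):

    static void apply_filter_float_sketch(float *dst, const float *src,
                                          const float *filter, int filter_length)
    {
        float acc = 0.0f;
        for (int i = 0; i < filter_length; i++)
            acc += src[i] * filter[i];
        *dst = acc;   /* st1 {v0.s}[0] above */
    }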
function ff_resample_common_apply_filter_x8_float_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.4S}, [x1], #16 // src[0..3]
ld1 {v2.4S}, [x2], #16 // filter[0..3]
ld1 {v3.4S}, [x1], #16 // src[4..7]
ld1 {v4.4S}, [x2], #16 // filter[4..7]
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3]
fmla v0.4S, v3.4S, v4.4S // accumulator += src[4..7] * filter[4..7]
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
ld1 {v3.4s}, [x1], #16 // src[4..7]
ld1 {v4.4s}, [x2], #16 // filter[4..7]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
function ff_resample_common_apply_filter_x4_s16_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.4H}, [x1], #8 // src[0..3]
ld1 {v2.4H}, [x2], #8 // filter[0..3]
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3]
movi v0.4s, #0 // accumulator
1: ld1 {v1.4h}, [x1], #8 // src[0..3]
ld1 {v2.4h}, [x2], #8 // filter[0..3]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
function ff_resample_common_apply_filter_x8_s16_neon, export=1
movi v0.4S, #0 // accumulator
1: ld1 {v1.8H}, [x1], #16 // src[0..7]
ld1 {v2.8H}, [x2], #16 // filter[0..7]
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4S, v1.8H, v2.8H // accumulator += src[4..7] * filter[4..7]
movi v0.4s, #0 // accumulator
1: ld1 {v1.8h}, [x1], #16 // src[0..7]
ld1 {v2.8h}, [x2], #16 // filter[0..7]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
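The s16 variants accumulate the same dot product in 32 bits via smlal/smlal2 and store one 32-bit result; a sketch, with the int32_t destination type inferred from the 4-byte store:

    #include <stdint.h>

    static void apply_filter_s16_sketch(int32_t *dst, const int16_t *src,
                                        const int16_t *filter, int filter_length)
    {
        int32_t acc = 0;
        for (int i = 0; i < filter_length; i++)
            acc += src[i] * filter[i];   /* smlal/smlal2 widen to 32 bits */
        *dst = acc;
    }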


@ -50,43 +50,43 @@ function ff_hscale8to15_X8_neon, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H}, [x1], #8 // write to destination part0123
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
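Per output pixel, this routine sums filterSize products of 8-bit source samples and 16-bit coefficients, then narrows with sqshrn #7, i.e. a shift by 7 with saturation to the 15-bit range; the specialized filterSize == 4 path that follows uses the same arithmetic. A scalar sketch (names follow the comments; the clamp below only bounds the upper end, whereas sqshrn also saturates negative sums):

    #include <stdint.h>

    static void hscale_8to15_sketch(int16_t *dst, int dstW, const uint8_t *src,
                                    const int16_t *filter, const int32_t *filterPos,
                                    int filterSize)
    {
        for (int i = 0; i < dstW; i++) {
            int val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[filterSize * i + j];
            val >>= 7;                             /* sqshrn #7             */
            dst[i] = val > 32767 ? 32767 : val;    /* clamp to 15-bit range */
        }
    }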
@ -245,7 +245,7 @@ function ff_hscale8to15_4_neon, export=1
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
1:
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose 8 bytes each from src into 4 registers
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
// load 8 values from filterPos to be used as offsets into src
ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
@ -253,74 +253,74 @@ function ff_hscale8to15_4_neon, export=1
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
movi v0.2D, #0 // Clear madd accumulator for idx 0..3
movi v5.2D, #0 // Clear madd accumulator for idx 4..7
movi v0.2d, #0 // Clear madd accumulator for idx 0..3
movi v5.2d, #0 // Clear madd accumulator for idx 4..7
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
add x5, x5, #32 // advance filterPos
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
uxtl v16.8H, v16.8B // unsigned extend long, convert src data to 16-bit
uxtl v17.8H, v17.8B // unsigned extend long, convert src data to 16-bit
uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit

ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration
uxtl v18.8H, v18.8B // unsigned extend long, convert src data to 16-bit
uxtl v19.8H, v19.8B // unsigned extend long, convert src data to 16-bit
uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
sub w2, w2, #8 // dstW -= 8
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
cmp w2, #16 // continue on main loop if there are at least 16 iterations left
b.ge 1b
// last full iteration
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp]
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7
ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
movi v0.2D, #0 // Clear madd accumulator for idx 0..3
movi v5.2D, #0 // Clear madd accumulator for idx 4..7
movi v0.2d, #0 // Clear madd accumulator for idx 0..3
movi v5.2d, #0 // Clear madd accumulator for idx 4..7
uxtl v16.8H, v16.8B // unsigned extend long, convert src data to 16-bit
uxtl v17.8H, v17.8B // unsigned extend long, convert src data to 16-bit
uxtl v18.8H, v18.8B // unsigned extend long, convert src data to 16-bit
uxtl v19.8H, v19.8B // unsigned extend long, convert src data to 16-bit
uxtl v16.8h, v16.8b // unsigned extend long, convert src data to 16-bit
uxtl v17.8h, v17.8b // unsigned extend long, convert src data to 16-bit
uxtl v18.8h, v18.8b // unsigned extend long, convert src data to 16-bit
uxtl v19.8h, v19.8b // unsigned extend long, convert src data to 16-bit
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3
smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7
smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
subs w2, w2, #8 // dstW -= 8
sqshrn v0.4H, v0.4S, #7 // shift and clip the 2x16-bit final values
sqshrn v1.4H, v5.4S, #7 // shift and clip the 2x16-bit final values
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7]
sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values
sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values
st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
@ -332,15 +332,15 @@ function ff_hscale8to15_4_neon, export=1
// load src
ldr w8, [x5], #4 // filterPos[i]
add x9, x3, w8, UXTW // calculate the address for src load
ld1 {v5.S}[0], [x9] // src[filterPos[i] + 0..3]
ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
// load filter
ld1 {v6.4H}, [x4], #8 // filter[filterSize * i + 0..3]
ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
uxtl v5.8H, v5.8B // unsigned extend long, convert src data to 16-bit
smull v0.4S, v5.4H, v6.4H // 4 iterations of src[...] * filter[...]
addv s0, v0.4S // add up products of src and filter values
uxtl v5.8h, v5.8b // unsigned extend long, convert src data to 16-bit
smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
addv s0, v0.4s // add up products of src and filter values
sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value
st1 {v0.H}[0], [x1], #2 // dst[i] = ...
st1 {v0.h}[0], [x1], #2 // dst[i] = ...
sub w2, w2, #1 // dstW--
cbnz w2, 2b
@ -445,12 +445,12 @@ function ff_hscale8to19_4_neon, export=1
smull v5.4s, v0.4h, v28.4h
smull2 v6.4s, v0.8h, v28.8h
uxtl v2.8h, v2.8b
smlal v5.4s, v1.4h, v29.4H
smlal2 v6.4s, v1.8h, v29.8H
smlal v5.4s, v1.4h, v29.4h
smlal2 v6.4s, v1.8h, v29.8h
uxtl v3.8h, v3.8b
smlal v5.4s, v2.4h, v30.4H
smlal2 v6.4s, v2.8h, v30.8H
smlal v5.4s, v3.4h, v31.4H
smlal v5.4s, v2.4h, v30.4h
smlal2 v6.4s, v2.8h, v30.8h
smlal v5.4s, v3.4h, v31.4h
smlal2 v6.4s, v3.8h, v31.8h
sshr v5.4s, v5.4s, #3
@ -472,8 +472,8 @@ function ff_hscale8to19_4_neon, export=1
ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single
ld1 {v31.4h}, [x4], #8
uxtl v0.8h, v0.8b
smull v5.4s, v0.4h, v31.4H
saddlv d0, v5.4S
smull v5.4s, v0.4h, v31.4h
saddlv d0, v5.4s
sqshrn s0, d0, #3
smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4
@ -499,42 +499,42 @@ function ff_hscale8to19_X8_neon, export=1
ldr w11, [x5], #4 // filterPos[idx + 2]
add x4, x13, x7 // filter3 = filter2 + filterSize*2
ldr w9, [x5], #4 // filterPos[idx + 3]
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}]
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
subs w15, w15, #8 // j -= 8: processed 8/filterSize
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshr v0.4s, v0.4S, #3 // shift and clip the 2x16-bit final values
sshr v0.4s, v0.4s, #3 // shift and clip the 2x16-bit final values
smin v0.4s, v0.4s, v20.4s
st1 {v0.4s}, [x1], #16 // write to destination part0123
b.gt 1b // loop until end of line
@ -588,16 +588,16 @@ function ff_hscale8to19_X4_neon, export=1
smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0
ldr d6, [x10], #8 // load src values for idx 2
ldr q29, [x14, x16] // load filter values for idx 2
smlal v17.4s, v5.4h, v30.4H // multiplication of lower half for idx 1
smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1
ldr d7, [x11], #8 // load src values for idx 3
smlal2 v17.4s, v5.8h, v30.8H // multiplication of upper half for idx 1
uxtl v6.8h, v6.8B // extend type to match the filter's size
smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1
uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr q28, [x15, x16] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2
uxtl v7.8h, v7.8B
smlal2 v18.4s, v6.8h, v29.8H // multiplication of upper half for idx 2
uxtl v7.8h, v7.8b
smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2
sub w0, w0, #8
smlal v19.4s, v7.4h, v28.4H // multiplication of lower half for idx 3
smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3
cmp w0, #8
smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3
add x16, x16, #16 // advance filter values indexing
@ -618,11 +618,11 @@ function ff_hscale8to19_X4_neon, export=1
uxtl v5.8h, v5.8b // extend type to match the filter's size
ldr s6, [x10] // load src values for idx 2
smlal v17.4s, v5.4h, v30.4h
uxtl v6.8h, v6.8B // extend type to match the filter's size
uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr d29, [x14, x17] // load filter values for idx 2
ldr s7, [x11] // load src values for idx 3
addp v16.4s, v16.4s, v17.4s
uxtl v7.8h, v7.8B
uxtl v7.8h, v7.8b
ldr d28, [x15, x17] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h
smlal v19.4s, v7.4h, v28.4h
@ -700,31 +700,31 @@ function ff_hscale16to15_4_neon_asm, export=1
// Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s
@ -775,31 +775,31 @@ function ff_hscale16to15_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s
@ -807,7 +807,7 @@ function ff_hscale16to15_4_neon_asm, export=1
sshl v6.4s, v6.4s, v17.4s
smin v5.4s, v5.4s, v18.4s
smin v6.4s, v6.4s, v18.4s
xtn v5.4h, v5.4S
xtn v5.4h, v5.4s
xtn2 v5.8h, v6.4s
st1 {v5.8h}, [x1], #16
@ -826,7 +826,7 @@ function ff_hscale16to15_4_neon_asm, export=1
uxtl v0.4s, v0.4h
sxtl v31.4s, v31.4h
mul v5.4s, v0.4s, v31.4s
addv s0, v5.4S
addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s
st1 {v0.h}[0], [x1], #2
@ -865,58 +865,58 @@ function ff_hscale16to15_X8_neon_asm, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4H // extend filter lower half
uxtl2 v6.4s, v6.8H // extend srcp upper half
mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4h // extend filter lower half
uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half
ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4H // extend srcp lower half
sxtl v23.4s, v17.4H // extend filter lower half
uxtl2 v16.4s, v16.8H // extend srcp upper half
ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4h // extend srcp lower half
sxtl v23.4s, v17.4h // extend filter lower half
uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half
mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v28.4s, v18.4H // extend srcp lower half
sxtl v29.4s, v19.4H // extend filter lower half
uxtl2 v18.4s, v18.8H // extend srcp upper half
uxtl v28.4s, v18.4h // extend srcp lower half
sxtl v29.4s, v19.4h // extend filter lower half
uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half
mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as the shift amount is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
xtn v0.4h, v0.4s // narrow down to 16 bits
st1 {v0.4H}, [x1], #8 // write to destination part0123
st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line
ret
endfunc
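The 16-bit input variant widens both source and filter to 32 bits before multiplying, and replaces the fixed #7 narrowing with a bit-depth dependent right shift (sshl with a negative amount in v21) followed by smin against v20. A sketch with that shift passed explicitly as an assumed parameter sh:

    #include <stdint.h>

    static void hscale_16to15_sketch(int16_t *dst, int dstW, const uint16_t *src,
                                     const int16_t *filter, const int32_t *filterPos,
                                     int filterSize, int sh)
    {
        for (int i = 0; i < dstW; i++) {
            int val = 0;   /* 32-bit accumulation; overflow expected, as noted above */
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[filterSize * i + j];
            val >>= sh;                            /* sshl by the negative amount in v21 */
            dst[i] = val > 32767 ? 32767 : val;    /* smin against v20                   */
        }
    }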
@ -1108,31 +1108,31 @@ function ff_hscale16to19_4_neon_asm, export=1
// Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s
@ -1181,31 +1181,31 @@ function ff_hscale16to19_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H
sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H
sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H
sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v29.8H
sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v30.4H
sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v30.8H
sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v31.4H
sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v31.8H
sxtl2 v28.4s, v31.8h
subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s
@ -1232,7 +1232,7 @@ function ff_hscale16to19_4_neon_asm, export=1
sxtl v31.4s, v31.4h
subs w2, w2, #1
mul v5.4s, v0.4s, v31.4s
addv s0, v5.4S
addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4
@ -1270,52 +1270,52 @@ function ff_hscale16to19_X8_neon_asm, export=1
add x13, x12, x7 // filter2 = filter1 + filterSize*2
lsl w10, w10, #1
add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3])
movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter
2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size
2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4H // extend filter lower half
uxtl2 v6.4s, v6.8H // extend srcp upper half
mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
sxtl v27.4s, v7.4h // extend filter lower half
uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half
ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4H // extend srcp lower half
sxtl v23.4s, v17.4H // extend filter lower half
uxtl2 v16.4s, v16.8H // extend srcp upper half
ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4h // extend srcp lower half
sxtl v23.4s, v17.4h // extend filter lower half
uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half
mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize
mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v28.4s, v18.4H // extend srcp lower half
sxtl v29.4s, v19.4H // extend filter lower half
uxtl2 v18.4s, v18.8H // extend srcp upper half
uxtl v28.4s, v18.4h // extend srcp lower half
sxtl v29.4s, v19.4h // extend filter lower half
uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half
mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding
addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4
sshl v0.4s, v0.4s, v21.4s // shift left (effectively right, as the shift amount is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)


@ -29,13 +29,13 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither,
// w6 - int offset
ld1 {v0.8B}, [x5] // load 8x8-bit dither
ld1 {v0.8b}, [x5] // load 8x8-bit dither
and w6, w6, #7
cbz w6, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8H, v0.8B // extend dither to 16-bit
ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1)
ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2)
ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8h, v0.8b // extend dither to 16-bit
ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
cmp w1, #8 // if filterSize == 8, branch to specialized version
b.eq 6f
cmp w1, #4 // if filterSize == 4, branch to specialized version
@ -48,8 +48,8 @@ function ff_yuv2planeX_8_neon, export=1
mov x7, #0 // i = 0
tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0
2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
@ -57,12 +57,12 @@ function ff_yuv2planeX_8_neon, export=1
ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
add x11, x11, x7, lsl #1 // &src[j ][i]
add x12, x12, x7, lsl #1 // &src[j+1][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
smlal v3.4S, v5.4H, v7.H[0] // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v7.H[0] // val1 += {E,F,G,H} * X
smlal v3.4S, v6.4H, v7.H[1] // val0 += {I,J,K,L} * Y
smlal2 v4.4S, v6.8H, v7.H[1] // val1 += {M,N,O,P} * Y
ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
subs w8, w8, #2 // tmpfilterSize -= 2
b.gt 3b // loop until filterSize consumed
@ -77,17 +77,17 @@ function ff_yuv2planeX_8_neon, export=1
// If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0
4: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter
5: ldr x11, [x9], #8 // get 1 pointer: src[j]
ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j]
add x11, x11, x7, lsl #1 // &src[j ][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
smlal v3.4S, v5.4H, v6.H[0] // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v6.H[0] // val1 += {E,F,G,H} * X
ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
subs w8, w8, #1 // tmpfilterSize -= 1
b.gt 5b // loop until filterSize consumed
@ -107,36 +107,36 @@ function ff_yuv2planeX_8_neon, export=1
ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]
// load 8x16-bit values for filter[j], where j=0..7
ld1 {v6.8H}, [x0]
ld1 {v6.8h}, [x0]
7:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v28.8H}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
ld1 {v29.8H}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
ld1 {v30.8H}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
ld1 {v31.8H}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4S, v28.4H, v6.H[4] // val0 += src[4][i + {0..3}] * filter[4]
smlal2 v4.4S, v28.8H, v6.H[4] // val1 += src[4][i + {4..7}] * filter[4]
smlal v3.4S, v29.4H, v6.H[5] // val0 += src[5][i + {0..3}] * filter[5]
smlal2 v4.4S, v29.8H, v6.H[5] // val1 += src[5][i + {4..7}] * filter[5]
smlal v3.4S, v30.4H, v6.H[6] // val0 += src[6][i + {0..3}] * filter[6]
smlal2 v4.4S, v30.8H, v6.H[6] // val1 += src[6][i + {4..7}] * filter[6]
smlal v3.4S, v31.4H, v6.H[7] // val0 += src[7][i + {0..3}] * filter[7]
smlal2 v4.4S, v31.8H, v6.H[7] // val1 += src[7][i + {4..7}] * filter[7]
smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -151,24 +151,24 @@ function ff_yuv2planeX_8_neon, export=1
ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
// load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
ld1 {v6.4H}, [x0]
ld1 {v6.4h}, [x0]
9:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -184,16 +184,16 @@ function ff_yuv2planeX_8_neon, export=1
// load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
ldr s6, [x0]
11:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value
mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
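All of the specialized paths above compute the same vertical filtering: the dither byte is pre-shifted left by 12 and seeds the accumulator, filterSize products of 16-bit samples and coefficients are added, and the result is narrowed with saturation. A scalar sketch; the total shift of 19 (16 in the sqshrun plus 3 in a later narrowing step not shown in this hunk) is an assumption:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void yuv2planeX_sketch(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dst, int dstW,
                                  const uint8_t *dither, int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = dither[(i + offset) & 7] << 12;  /* ushll #12 above */
            for (int j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];          /* smlal/smlal2    */
            dst[i] = clip_u8(val >> 19);
        }
    }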
@ -210,11 +210,11 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW,
// x3 - const uint8_t *dither,
// w4 - int offset
ld1 {v0.8B}, [x3] // load 8x8-bit dither
ld1 {v0.8b}, [x3] // load 8x8-bit dither
and w4, w4, #7
cbz w4, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8H, v0.8B // extend dither to 16-bit
ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8h, v0.8b // extend dither to 16-bit
uxtl v1.4s, v0.4h
uxtl2 v2.4s, v0.8h
2:


@ -33,9 +33,9 @@
.macro load_args_nv12
ldr x8, [sp] // table
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
@ -51,9 +51,9 @@
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@ -67,9 +67,9 @@
ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8]
dup v0.8H, w10
dup v3.8H, w9
ld1 {v1.1d}, [x8]
dup v0.8h, w10
dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@ -77,22 +77,22 @@
.endm
.macro load_chroma_nv12
ld2 {v16.8B, v17.8B}, [x6], #16
ushll v18.8H, v16.8B, #3
ushll v19.8H, v17.8B, #3
ld2 {v16.8b, v17.8b}, [x6], #16
ushll v18.8h, v16.8b, #3
ushll v19.8h, v17.8b, #3
.endm
.macro load_chroma_nv21
ld2 {v16.8B, v17.8B}, [x6], #16
ushll v19.8H, v16.8B, #3
ushll v18.8H, v17.8B, #3
ld2 {v16.8b, v17.8b}, [x6], #16
ushll v19.8h, v16.8b, #3
ushll v18.8h, v17.8b, #3
.endm
.macro load_chroma_yuv420p
ld1 {v16.8B}, [ x6], #8
ld1 {v17.8B}, [x13], #8
ushll v18.8H, v16.8B, #3
ushll v19.8H, v17.8B, #3
ld1 {v16.8b}, [ x6], #8
ld1 {v17.8b}, [x13], #8
ushll v18.8h, v16.8b, #3
ushll v19.8h, v17.8b, #3
.endm
.macro load_chroma_yuv422p
@ -123,18 +123,18 @@
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
add v20.8H, v26.8H, v20.8H // Y1 + R1
add v21.8H, v27.8H, v21.8H // Y2 + R2
add v22.8H, v26.8H, v22.8H // Y1 + G1
add v23.8H, v27.8H, v23.8H // Y2 + G2
add v24.8H, v26.8H, v24.8H // Y1 + B1
add v25.8H, v27.8H, v25.8H // Y2 + B2
sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1)
sqrshrun \r2, v21.8H, #1 // clip_u8((Y2 + R1) >> 1)
sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1)
sqrshrun \g2, v23.8H, #1 // clip_u8((Y2 + G1) >> 1)
sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1)
sqrshrun \b2, v25.8H, #1 // clip_u8((Y2 + B1) >> 1)
add v20.8h, v26.8h, v20.8h // Y1 + R1
add v21.8h, v27.8h, v21.8h // Y2 + R2
add v22.8h, v26.8h, v22.8h // Y1 + G1
add v23.8h, v27.8h, v23.8h // Y2 + G2
add v24.8h, v26.8h, v24.8h // Y1 + B1
add v25.8h, v27.8h, v25.8h // Y2 + B2
sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1)
sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R1) >> 1)
sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1)
sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1)
sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1)
movi \a1, #255
movi \a2, #255
.endm
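Taken together with the chroma and luma scaling in the function body further down (the sqdmulh lines against v1.h[0..3] and v0.8h), the compute_rgba macro above amounts to roughly the following per-pixel fixed-point evaluation; sqdmulh is modeled as (a * b) >> 15 and its saturation is ignored, and the coefficient names (y_offset, y_coeff, v2r, u2g, v2g, u2b) follow the comments rather than any C declaration:

    #include <stdint.h>

    static uint8_t clip_u8_px(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void yuv_to_rgb_pixel_sketch(int Y, int U, int V,
                                        int y_offset, int y_coeff,
                                        int v2r, int u2g, int v2g, int u2b,
                                        uint8_t *r, uint8_t *g, uint8_t *b)
    {
        int y = (((Y << 3) - y_offset) * y_coeff) >> 15;   /* sqdmulh by y_coeff */
        int u = (U << 3) - (128 << 3);
        int v = (V << 3) - (128 << 3);
        *r = clip_u8_px((y + ((v * v2r) >> 15) + 1) >> 1);                       /* sqrshrun #1 */
        *g = clip_u8_px((y + ((u * u2g) >> 15) + ((v * v2g) >> 15) + 1) >> 1);
        *b = clip_u8_px((y + ((u * u2b) >> 15) + 1) >> 1);
    }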
@ -146,47 +146,47 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
1:
mov w8, w0 // w8 = width
2:
movi v5.8H, #4, lsl #8 // 128 * (1<<3)
movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt
sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3)
sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R)
sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g
sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g
add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G)
sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B)
zip2 v21.8H, v20.8H, v20.8H // R2
zip1 v20.8H, v20.8H, v20.8H // R1
zip2 v23.8H, v22.8H, v22.8H // G2
zip1 v22.8H, v22.8H, v22.8H // G1
zip2 v25.8H, v24.8H, v24.8H // B2
zip1 v24.8H, v24.8H, v24.8H // B1
ld1 {v2.16B}, [x4], #16 // load luma
ushll v26.8H, v2.8B, #3 // Y1*(1<<3)
ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3)
sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset
sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset
sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8h, v20.8h, v20.8h // R1
zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8h, v24.8h, v24.8h // B1
ld1 {v2.16b}, [x4], #16 // load luma
ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B
compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif
.ifc \ofmt,rgba // 0 1 2 3
compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B
compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif
.ifc \ofmt,abgr // 3 2 1 0
compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B
compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif
.ifc \ofmt,bgra // 2 1 0 3
compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32
st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
subs w8, w8, #16 // width -= 16
b.gt 2b
add x2, x2, w3, SXTW // dst += padding