loongarch: Improve the performance of w_avg functions

Relative speedup over C code:

w_avg_w4_8bpc_c:                         8.6 ( 1.00x)
w_avg_w4_8bpc_lsx:                       1.0 ( 8.53x)
w_avg_w4_8bpc_lasx:                      1.0 ( 8.79x)
w_avg_w8_8bpc_c:                        24.4 ( 1.00x)
w_avg_w8_8bpc_lsx:                       2.7 ( 8.90x)
w_avg_w8_8bpc_lasx:                      1.6 (15.33x)
w_avg_w16_8bpc_c:                       77.4 ( 1.00x)
w_avg_w16_8bpc_lsx:                      6.9 (11.29x)
w_avg_w16_8bpc_lasx:                     5.2 (14.88x)
w_avg_w32_8bpc_c:                      303.7 ( 1.00x)
w_avg_w32_8bpc_lsx:                     27.2 (11.16x)
w_avg_w32_8bpc_lasx:                    14.2 (21.43x)
w_avg_w64_8bpc_c:                      725.8 ( 1.00x)
w_avg_w64_8bpc_lsx:                     66.1 (10.98x)
w_avg_w64_8bpc_lasx:                    35.4 (20.48x)
w_avg_w128_8bpc_c:                    1812.6 ( 1.00x)
w_avg_w128_8bpc_lsx:                   169.9 (10.67x)
w_avg_w128_8bpc_lasx:                  111.7 (16.23x)
Author: Hao Chen, 2023-12-01 10:21:11 +08:00 (committed by yuanhecai)
parent a23a1e7f81
commit bde69a94bf
2 changed files with 377 additions and 0 deletions


@@ -929,3 +929,376 @@ endfunc
warp_lasx , 11
warp_lasx t, 7
/*
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2,
const int w, int h,
const int weight HIGHBD_DECL_SUFFIX)
*/
#define bpc8_sh 5 // sh = intermediate_bits + 1
#define bpcw8_sh 8 // sh = intermediate_bits + 4
#define bpc_sh bpc8_sh
#define bpcw_sh bpcw8_sh
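/*
Roughly, the per-pixel operation these routines vectorize (8bpc, so
intermediate_bits = 4, bpcw_sh = 8 and the rounding term is 1 << (bpcw_sh - 1)):

    dst[x] = clip_to_u8((tmp1[x] * weight + tmp2[x] * (16 - weight) + 128) >> bpcw_sh)

weight is kept in vr21/xr21 and (16 - weight) in vr22/xr22; the saturating
round/shift/narrow instructions apply the rounding, the shift and the clip to
[0, 255] in one step. (Sketch based on the C signature quoted above; clip_to_u8
is illustrative, not an actual dav1d helper.)
*/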
function w_avg_8bpc_lsx
addi.d t8, a0, 0
li.w t2, 16
sub.w t2, t2, a6 // 16 - weight
vreplgr2vr.h vr21, a6
vreplgr2vr.h vr22, t2
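// Dispatch on block width: clz(w) - 24 maps w = 128, 64, 32, 16, 8, 4
// to entries 0..5 of .W_AVG_LSX_JRTABLE below.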
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .W_AVG_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.W_AVG_LSX_JRTABLE:
.hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE
.W_AVG_W4_LSX:
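// Two 4-pixel rows per iteration: even/odd int16 lanes are widened to int32,
// weighted, then rounded/saturated back down and re-interleaved into pixel
// order before the two 4-byte stores.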
vld vr0, a2, 0
vld vr1, a3, 0
vmulwev.w.h vr2, vr0, vr21
vmulwod.w.h vr3, vr0, vr21
vmaddwev.w.h vr2, vr1, vr22
vmaddwod.w.h vr3, vr1, vr22
vssrarni.hu.w vr3, vr2, bpcw_sh
vssrlni.bu.h vr1, vr3, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a1, a0
blt zero, a5, .W_AVG_W4_LSX
b .W_AVG_END_LSX
.W_AVG_W8_LSX:
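// One 8-pixel row per iteration; same even/odd widen-and-weight scheme as the
// w == 4 case, stored with a single 8-byte store.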
vld vr0, a2, 0
vld vr1, a3, 0
vmulwev.w.h vr2, vr0, vr21
vmulwod.w.h vr3, vr0, vr21
vmaddwev.w.h vr2, vr1, vr22
vmaddwod.w.h vr3, vr1, vr22
vssrarni.hu.w vr3, vr2, bpcw_sh
vssrlni.bu.h vr1, vr3, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.d f0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .W_AVG_W8_LSX
b .W_AVG_END_LSX
.W_AVG_W16_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W16_LSX
b .W_AVG_END_LSX
.W_AVG_W32_LSX:
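// 32 pixels per row, handled as two 16-pixel column blocks; t8 keeps the start
// of the current destination row so a0 can be rewound after the .rept body.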
.rept 2
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W32_LSX
b .W_AVG_END_LSX
.W_AVG_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W64_LSX
b .W_AVG_END_LSX
.W_AVG_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W128_LSX
.W_AVG_END_LSX:
endfunc
function w_avg_8bpc_lasx
addi.d t8, a0, 0
li.w t2, 16
sub.w t2, t2, a6 // 16 - weight
xvreplgr2vr.h xr21, a6
xvreplgr2vr.h xr22, t2
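// Same width dispatch as the LSX version (clz(w) - 24 indexes the jump table),
// but operating on 256-bit LASX vectors.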
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .W_AVG_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.W_AVG_LASX_JRTABLE:
.hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE
.W_AVG_W4_LASX:
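// Two 4-pixel rows per iteration: tmp1/tmp2 halfwords are interleaved so a
// single widening multiply-add (even lanes * weight + odd lanes * (16 - weight))
// yields the results directly in pixel order, one row per 128-bit lane.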
vld vr0, a2, 0
vld vr1, a3, 0
xvpermi.d xr2, xr0, 0xD8
xvpermi.d xr3, xr1, 0xD8
xvilvl.h xr4, xr3, xr2
xvmulwev.w.h xr0, xr4, xr21
xvmaddwod.w.h xr0, xr4, xr22
xvssrarni.hu.w xr1, xr0, bpcw_sh
xvssrlni.bu.h xr0, xr1, 0
fst.s f0, a0, 0
add.d a0, a0, a1
xvstelm.w xr0, a0, 0, 4
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a1, a0
blt zero, a5, .W_AVG_W4_LASX
b .W_AVG_END_LASX
.W_AVG_W8_LASX:
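// Two 8-pixel rows per iteration (one per 128-bit lane), using the same
// even/odd widen-and-weight scheme as the LSX code.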
xvld xr0, a2, 0
xvld xr1, a3, 0
xvmulwev.w.h xr2, xr0, xr21
xvmulwod.w.h xr3, xr0, xr21
xvmaddwev.w.h xr2, xr1, xr22
xvmaddwod.w.h xr3, xr1, xr22
xvssrarni.hu.w xr3, xr2, bpcw_sh
xvssrlni.bu.h xr1, xr3, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvstelm.d xr0, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr0, a0, 0, 2
addi.w a5, a5, -2
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W8_LASX
b .W_AVG_END_LASX
.W_AVG_W16_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvmulwev.w.h xr2, xr0, xr21
xvmulwod.w.h xr3, xr0, xr21
xvmaddwev.w.h xr2, xr1, xr22
xvmaddwod.w.h xr3, xr1, xr22
xvssrarni.hu.w xr3, xr2, bpcw_sh
xvssrlni.bu.h xr1, xr3, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvpermi.d xr1, xr0, 0xD8
vst vr1, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W16_LASX
b .W_AVG_END_LASX
.W_AVG_W32_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 64
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .W_AVG_W32_LASX
b .W_AVG_END_LASX
.W_AVG_W64_LASX:
.rept 2
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a0, a0, 32
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W64_LASX
b .W_AVG_END_LASX
.W_AVG_W128_LASX:
.rept 4
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a0, a0, 32
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W128_LASX
.W_AVG_END_LASX:
endfunc
#undef bpc_sh
#undef bpcw_sh


@@ -32,9 +32,11 @@
#include "src/mc.h"
#include "src/cpu.h"
decl_w_avg_fn(BF(dav1d_w_avg, lsx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
decl_w_avg_fn(BF(dav1d_w_avg, lasx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
@@ -44,11 +46,13 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
c->w_avg = BF(dav1d_w_avg, lsx);
c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
c->w_avg = BF(dav1d_w_avg, lasx);
c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);