loongarch: Improve the performance of mask and w_mask_420 functions

Relative speedup over C code:

mask_w4_8bpc_c:                             9.2 ( 1.00x)
mask_w4_8bpc_lsx:                           1.1 ( 8.31x)
mask_w4_8bpc_lasx:                          1.2 ( 7.42x)
mask_w8_8bpc_c:                            27.4 ( 1.00x)
mask_w8_8bpc_lsx:                           2.6 (10.54x)
mask_w8_8bpc_lasx:                          1.9 (14.65x)
mask_w16_8bpc_c:                           87.2 ( 1.00x)
mask_w16_8bpc_lsx:                          8.0 (10.92x)
mask_w16_8bpc_lasx:                         6.5 (13.46x)
mask_w32_8bpc_c:                          343.4 ( 1.00x)
mask_w32_8bpc_lsx:                         31.7 (10.84x)
mask_w32_8bpc_lasx:                        22.1 (15.51x)
mask_w64_8bpc_c:                          824.9 ( 1.00x)
mask_w64_8bpc_lsx:                         78.0 (10.57x)
mask_w64_8bpc_lasx:                        54.1 (15.25x)
mask_w128_8bpc_c:                        2042.9 ( 1.00x)
mask_w128_8bpc_lsx:                       200.7 (10.18x)
mask_w128_8bpc_lasx:                      157.1 (13.00x)

w_mask_420_w4_8bpc_c:                      19.0 ( 1.00x)
w_mask_420_w4_8bpc_lsx:                     1.7 (11.11x)
w_mask_420_w4_8bpc_lasx:                    1.2 (15.87x)
w_mask_420_w8_8bpc_c:                      58.2 ( 1.00x)
w_mask_420_w8_8bpc_lsx:                     4.6 (12.58x)
w_mask_420_w8_8bpc_lasx:                    2.5 (23.74x)
w_mask_420_w16_8bpc_c:                    188.0 ( 1.00x)
w_mask_420_w16_8bpc_lsx:                   11.8 (15.88x)
w_mask_420_w16_8bpc_lasx:                   8.3 (22.66x)
w_mask_420_w32_8bpc_c:                    742.2 ( 1.00x)
w_mask_420_w32_8bpc_lsx:                   47.3 (15.68x)
w_mask_420_w32_8bpc_lasx:                  32.7 (22.68x)
w_mask_420_w64_8bpc_c:                   1786.3 ( 1.00x)
w_mask_420_w64_8bpc_lsx:                  112.4 (15.89x)
w_mask_420_w64_8bpc_lasx:                  78.4 (22.78x)
w_mask_420_w128_8bpc_c:                  4442.2 ( 1.00x)
w_mask_420_w128_8bpc_lsx:                 298.9 (14.86x)
w_mask_420_w128_8bpc_lasx:                220.5 (20.15x)
This commit is contained in:
yuanhecai 2023-12-01 10:29:51 +08:00
parent bde69a94bf
commit 4080673c17
2 changed files with 1041 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -33,12 +33,16 @@
#include "src/cpu.h"
decl_w_avg_fn(BF(dav1d_w_avg, lsx));
decl_mask_fn(BF(dav1d_mask, lsx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));
decl_w_avg_fn(BF(dav1d_w_avg, lasx));
decl_mask_fn(BF(dav1d_mask, lasx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));
static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
#if BITDEPTH == 8
@@ -47,14 +51,18 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
c->w_avg = BF(dav1d_w_avg, lsx);
c->mask = BF(dav1d_mask, lsx);
c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
c->w_mask[2] = BF(dav1d_w_mask_420, lsx);
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
c->w_avg = BF(dav1d_w_avg, lasx);
c->mask = BF(dav1d_mask, lasx);
c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
c->w_mask[2] = BF(dav1d_w_mask_420, lasx);
#endif
}