loongarch: Improve the performance of mc_8bpc.mct functions

Relative speedup over C code:

mct_8tap_regular_w4_0_8bpc_c:                      4.2 ( 1.00x)
mct_8tap_regular_w4_0_8bpc_lasx:                   0.5 ( 9.08x)
mct_8tap_regular_w4_h_8bpc_c:                     12.5 ( 1.00x)
mct_8tap_regular_w4_h_8bpc_lasx:                   1.6 ( 7.80x)
mct_8tap_regular_w4_hv_8bpc_c:                    33.5 ( 1.00x)
mct_8tap_regular_w4_hv_8bpc_lasx:                  6.0 ( 5.54x)
mct_8tap_regular_w4_v_8bpc_c:                     13.6 ( 1.00x)
mct_8tap_regular_w4_v_8bpc_lasx:                   2.2 ( 6.22x)
mct_8tap_regular_w8_0_8bpc_c:                     11.3 ( 1.00x)
mct_8tap_regular_w8_0_8bpc_lasx:                   0.7 (15.77x)
mct_8tap_regular_w8_h_8bpc_c:                     39.1 ( 1.00x)
mct_8tap_regular_w8_h_8bpc_lasx:                   4.7 ( 8.30x)
mct_8tap_regular_w8_hv_8bpc_c:                    90.9 ( 1.00x)
mct_8tap_regular_w8_hv_8bpc_lasx:                 17.2 ( 5.29x)
mct_8tap_regular_w8_v_8bpc_c:                     40.5 ( 1.00x)
mct_8tap_regular_w8_v_8bpc_lasx:                   6.9 ( 5.86x)
mct_8tap_regular_w16_0_8bpc_c:                    34.3 ( 1.00x)
mct_8tap_regular_w16_0_8bpc_lasx:                  1.3 (26.32x)
mct_8tap_regular_w16_h_8bpc_c:                   128.3 ( 1.00x)
mct_8tap_regular_w16_h_8bpc_lasx:                 20.5 ( 6.26x)
mct_8tap_regular_w16_hv_8bpc_c:                  273.5 ( 1.00x)
mct_8tap_regular_w16_hv_8bpc_lasx:                54.5 ( 5.02x)
mct_8tap_regular_w16_v_8bpc_c:                   129.7 ( 1.00x)
mct_8tap_regular_w16_v_8bpc_lasx:                 22.8 ( 5.69x)
mct_8tap_regular_w32_0_8bpc_c:                   133.7 ( 1.00x)
mct_8tap_regular_w32_0_8bpc_lasx:                  5.4 (24.65x)
mct_8tap_regular_w32_h_8bpc_c:                   511.4 ( 1.00x)
mct_8tap_regular_w32_h_8bpc_lasx:                 85.1 ( 6.01x)
mct_8tap_regular_w32_hv_8bpc_c:                 1018.2 ( 1.00x)
mct_8tap_regular_w32_hv_8bpc_lasx:               210.0 ( 4.85x)
mct_8tap_regular_w32_v_8bpc_c:                   513.6 ( 1.00x)
mct_8tap_regular_w32_v_8bpc_lasx:                 88.7 ( 5.79x)
mct_8tap_regular_w64_0_8bpc_c:                   315.4 ( 1.00x)
mct_8tap_regular_w64_0_8bpc_lasx:                 13.2 (23.86x)
mct_8tap_regular_w64_h_8bpc_c:                  1236.8 ( 1.00x)
mct_8tap_regular_w64_h_8bpc_lasx:                208.2 ( 5.94x)
mct_8tap_regular_w64_hv_8bpc_c:                 2428.0 ( 1.00x)
mct_8tap_regular_w64_hv_8bpc_lasx:               502.7 ( 4.83x)
mct_8tap_regular_w64_v_8bpc_c:                  1238.3 ( 1.00x)
mct_8tap_regular_w64_v_8bpc_lasx:                214.0 ( 5.79x)
mct_8tap_regular_w128_0_8bpc_c:                  775.3 ( 1.00x)
mct_8tap_regular_w128_0_8bpc_lasx:                32.5 (23.86x)
mct_8tap_regular_w128_h_8bpc_c:                 3077.5 ( 1.00x)
mct_8tap_regular_w128_h_8bpc_lasx:               518.6 ( 5.93x)
mct_8tap_regular_w128_hv_8bpc_c:                5987.0 ( 1.00x)
mct_8tap_regular_w128_hv_8bpc_lasx:             1242.4 ( 4.82x)
mct_8tap_regular_w128_v_8bpc_c:                 3077.5 ( 1.00x)
mct_8tap_regular_w128_v_8bpc_lasx:               530.3 ( 5.80x)
This commit is contained in:
jinbo 2023-12-01 11:20:59 +08:00 committed by yuanhecai
parent b34ecaf310
commit ae8756ed91
2 changed files with 1125 additions and 0 deletions

File diff suppressed because it is too large. (Load Diff)

View File

@@ -61,6 +61,16 @@ decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx));
static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
#if BITDEPTH == 8
const unsigned flags = dav1d_get_cpu_flags();
@@ -93,6 +103,15 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
c->w_mask[2] = BF(dav1d_w_mask_420, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx);
#endif
}