loongarch: Improve the performance of mc_8bpc.mct functions

Relative speedup over C code:

mct_8tap_regular_w4_0_8bpc_c: 4.2 ( 1.00x)
mct_8tap_regular_w4_0_8bpc_lasx: 0.5 ( 9.08x)
mct_8tap_regular_w4_h_8bpc_c: 12.5 ( 1.00x)
mct_8tap_regular_w4_h_8bpc_lasx: 1.6 ( 7.80x)
mct_8tap_regular_w4_hv_8bpc_c: 33.5 ( 1.00x)
mct_8tap_regular_w4_hv_8bpc_lasx: 6.0 ( 5.54x)
mct_8tap_regular_w4_v_8bpc_c: 13.6 ( 1.00x)
mct_8tap_regular_w4_v_8bpc_lasx: 2.2 ( 6.22x)
mct_8tap_regular_w8_0_8bpc_c: 11.3 ( 1.00x)
mct_8tap_regular_w8_0_8bpc_lasx: 0.7 (15.77x)
mct_8tap_regular_w8_h_8bpc_c: 39.1 ( 1.00x)
mct_8tap_regular_w8_h_8bpc_lasx: 4.7 ( 8.30x)
mct_8tap_regular_w8_hv_8bpc_c: 90.9 ( 1.00x)
mct_8tap_regular_w8_hv_8bpc_lasx: 17.2 ( 5.29x)
mct_8tap_regular_w8_v_8bpc_c: 40.5 ( 1.00x)
mct_8tap_regular_w8_v_8bpc_lasx: 6.9 ( 5.86x)
mct_8tap_regular_w16_0_8bpc_c: 34.3 ( 1.00x)
mct_8tap_regular_w16_0_8bpc_lasx: 1.3 (26.32x)
mct_8tap_regular_w16_h_8bpc_c: 128.3 ( 1.00x)
mct_8tap_regular_w16_h_8bpc_lasx: 20.5 ( 6.26x)
mct_8tap_regular_w16_hv_8bpc_c: 273.5 ( 1.00x)
mct_8tap_regular_w16_hv_8bpc_lasx: 54.5 ( 5.02x)
mct_8tap_regular_w16_v_8bpc_c: 129.7 ( 1.00x)
mct_8tap_regular_w16_v_8bpc_lasx: 22.8 ( 5.69x)
mct_8tap_regular_w32_0_8bpc_c: 133.7 ( 1.00x)
mct_8tap_regular_w32_0_8bpc_lasx: 5.4 (24.65x)
mct_8tap_regular_w32_h_8bpc_c: 511.4 ( 1.00x)
mct_8tap_regular_w32_h_8bpc_lasx: 85.1 ( 6.01x)
mct_8tap_regular_w32_hv_8bpc_c: 1018.2 ( 1.00x)
mct_8tap_regular_w32_hv_8bpc_lasx: 210.0 ( 4.85x)
mct_8tap_regular_w32_v_8bpc_c: 513.6 ( 1.00x)
mct_8tap_regular_w32_v_8bpc_lasx: 88.7 ( 5.79x)
mct_8tap_regular_w64_0_8bpc_c: 315.4 ( 1.00x)
mct_8tap_regular_w64_0_8bpc_lasx: 13.2 (23.86x)
mct_8tap_regular_w64_h_8bpc_c: 1236.8 ( 1.00x)
mct_8tap_regular_w64_h_8bpc_lasx: 208.2 ( 5.94x)
mct_8tap_regular_w64_hv_8bpc_c: 2428.0 ( 1.00x)
mct_8tap_regular_w64_hv_8bpc_lasx: 502.7 ( 4.83x)
mct_8tap_regular_w64_v_8bpc_c: 1238.3 ( 1.00x)
mct_8tap_regular_w64_v_8bpc_lasx: 214.0 ( 5.79x)
mct_8tap_regular_w128_0_8bpc_c: 775.3 ( 1.00x)
mct_8tap_regular_w128_0_8bpc_lasx: 32.5 (23.86x)
mct_8tap_regular_w128_h_8bpc_c: 3077.5 ( 1.00x)
mct_8tap_regular_w128_h_8bpc_lasx: 518.6 ( 5.93x)
mct_8tap_regular_w128_hv_8bpc_c: 5987.0 ( 1.00x)
mct_8tap_regular_w128_hv_8bpc_lasx: 1242.4 ( 4.82x)
mct_8tap_regular_w128_v_8bpc_c: 3077.5 ( 1.00x)
mct_8tap_regular_w128_v_8bpc_lasx: 530.3 ( 5.80x)
2023-12-01 11:20:59 +08:00 · 2023-12-01 11:20:59 +08:00 · ae8756ed91
parent b34ecaf310
commit ae8756ed91
2 changed files with 1125 additions and 0 deletions
--- a/src/loongarch/mc.S
+++ b/src/loongarch/mc.S
--- a/src/loongarch/mc.h
+++ b/src/loongarch/mc.h
@@ -61,6 +61,16 @@ decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
 decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));

+decl_mct_fn(BF(dav1d_prep_8tap_regular,        lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp,  lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth,         lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp,   lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp,          lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular,  lasx));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth,   lasx));
+
 static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
 #if BITDEPTH == 8
    const unsigned flags = dav1d_get_cpu_flags();
@@ -93,6 +103,15 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
    c->w_mask[2] = BF(dav1d_w_mask_420, lasx);

+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        lasx);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  lasx);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         lasx);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   lasx);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  lasx);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   lasx);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          lasx);
 #endif
 }