Improve dct-a.S Performance by Using SVE/SVE2

Improve the performance of the NEON functions in aarch64/dct-a.S
by using the SVE/SVE2 instruction sets. The affected functions are
listed below together with their benchmark results (lower is better).

Command executed: ./checkasm8 --bench=sub
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
sub4x4_dct_c: 528
sub4x4_dct_neon: 322
sub4x4_dct_sve: 247

Command executed: ./checkasm8 --bench=sub
Testbed: AWS Graviton3
Results:
sub4x4_dct_c: 562
sub4x4_dct_neon: 376
sub4x4_dct_sve: 255

Command executed: ./checkasm8 --bench=add
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
add4x4_idct_c: 698
add4x4_idct_neon: 386
add4x4_idct_sve2: 345

Command executed: ./checkasm8 --bench=zigzag
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
zigzag_interleave_8x8_cavlc_frame_c: 582
zigzag_interleave_8x8_cavlc_frame_neon: 273
zigzag_interleave_8x8_cavlc_frame_sve: 257

Command executed: ./checkasm8 --bench=zigzag
Testbed: AWS Graviton3
Results:
zigzag_interleave_8x8_cavlc_frame_c: 587
zigzag_interleave_8x8_cavlc_frame_neon: 257
zigzag_interleave_8x8_cavlc_frame_sve: 249
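
For context, sub4x4_dct computes the fenc/fdec residual and then runs the
same 1-D butterfly over columns and rows. A hedged C paraphrase of the
scalar baseline measured above as sub4x4_dct_c, condensed to the 8-bit
case (the name sub4x4_dct_ref is mine, not x264's):

    #include <stdint.h>

    #define FENC_STRIDE 16
    #define FDEC_STRIDE 32

    /* Paraphrased 8-bit scalar reference: residual, then the same 1-D
     * butterfly applied to columns and then rows. */
    static void sub4x4_dct_ref( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
    {
        int16_t d[16], tmp[16];
        for( int y = 0; y < 4; y++ )          /* residual: fenc - fdec */
            for( int x = 0; x < 4; x++ )
                d[x + y*4] = pix1[x + y*FENC_STRIDE] - pix2[x + y*FDEC_STRIDE];
        for( int i = 0; i < 4; i++ )          /* vertical pass */
        {
            int s03 = d[i*4+0] + d[i*4+3], s12 = d[i*4+1] + d[i*4+2];
            int d03 = d[i*4+0] - d[i*4+3], d12 = d[i*4+1] - d[i*4+2];
            tmp[0*4+i] = s03 + s12;   tmp[1*4+i] = 2*d03 + d12;
            tmp[2*4+i] = s03 - s12;   tmp[3*4+i] = d03 - 2*d12;
        }
        for( int i = 0; i < 4; i++ )          /* horizontal pass */
        {
            int s03 = tmp[i*4+0] + tmp[i*4+3], s12 = tmp[i*4+1] + tmp[i*4+2];
            int d03 = tmp[i*4+0] - tmp[i*4+3], d12 = tmp[i*4+1] - tmp[i*4+2];
            dct[i*4+0] = s03 + s12;   dct[i*4+1] = 2*d03 + d12;
            dct[i*4+2] = s03 - s12;   dct[i*4+3] = d03 - 2*d12;
        }
    }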
Author: David Chen
Date:   2023-10-30 07:55:27 +02:00
Parent: b6190c6fa1
Commit: 5c382660fb
5 changed files with 211 additions and 1 deletion

Makefile

@@ -160,7 +160,7 @@ endif
OBJCHK += tools/checkasm-arm.o
endif
-# AArch64 NEON optims
+# AArch64 NEON and SVE/SVE2 optims
ifeq ($(SYS_ARCH),AARCH64)
SRCASM_X = common/aarch64/bitstream-a.S \
common/aarch64/cabac-a.S \
@@ -170,6 +170,12 @@ SRCASM_X = common/aarch64/bitstream-a.S \
common/aarch64/pixel-a.S \
common/aarch64/predict-a.S \
common/aarch64/quant-a.S
+ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
+SRCASM_X += common/aarch64/dct-a-sve.S
+endif
+ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
+SRCASM_X += common/aarch64/dct-a-sve2.S
+endif
SRCS_X += common/aarch64/asm-offsets.c \
common/aarch64/mc-c.c \
common/aarch64/predict-c.c

common/aarch64/dct-a-sve.S (new file)

@@ -0,0 +1,88 @@
/****************************************************************************
* dct-a-sve.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2023 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
.arch armv8-a+sve
function sub4x4_dct_sve, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
ptrue p0.h, vl4
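// p0 enables the first four halfword lanes; each ld1b below then loads
// four bytes and zero-extends them to halfwords in a single instruction,
// removing the separate widening step the NEON version needs.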
ld1b {z0.h}, p0/z, [x1]
add x1, x1, x3
ld1b {z1.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z2.h}, p0/z, [x1]
add x1, x1, x3
sub v16.4h, v0.4h, v1.4h
ld1b {z3.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z4.h}, p0/z, [x1]
add x1, x1, x3
sub v17.4h, v2.4h, v3.4h
ld1b {z5.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z6.h}, p0/z, [x1]
sub v18.4h, v4.4h, v5.4h
ld1b {z7.h}, p0/z, [x2]
sub v19.4h, v6.4h, v7.4h
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
ret
endfunc
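
The payoff here is in the loads: the NEON version has to widen bytes after
loading them (usubl), while SVE's predicated ld1b widens during the load. A
rough ACLE-intrinsics equivalent of one row's widening difference, for
illustration only — the names row_diff4, enc and dec are mine, not x264's;
compile with -march=armv8-a+sve:

    #include <arm_sve.h>
    #include <stdint.h>

    /* Load four bytes from each source, zero-extend to halfwords, and
     * subtract, all under a 4-lane predicate. */
    static inline svint16_t row_diff4( const uint8_t *enc, const uint8_t *dec )
    {
        svbool_t p4 = svptrue_pat_b16( SV_VL4 );   /* first 4 halfword lanes */
        svuint16_t a = svld1ub_u16( p4, enc );     /* ld1b {z.h}: widening load */
        svuint16_t b = svld1ub_u16( p4, dec );
        return svsub_s16_x( p4, svreinterpret_s16_u16( a ),
                                svreinterpret_s16_u16( b ) );
    }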
function zigzag_interleave_8x8_cavlc_sve, export=1
mov z31.s, #1
ptrue p2.s, vl2
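// z31 = 1 and p2 (two active .s lanes) drive the nnz stores at the end:
// four 0/1 flags are written as two byte pairs, at nnz[0..1] and nnz[8..9].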
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
umax v16.8h, v0.8h, v4.8h
umax v17.8h, v1.8h, v5.8h
umax v18.8h, v2.8h, v6.8h
umax v19.8h, v3.8h, v7.8h
st1 {v0.8h}, [x0], #16
st1 {v4.8h}, [x0], #16
umaxp v16.8h, v16.8h, v17.8h
umaxp v18.8h, v18.8h, v19.8h
st1 {v1.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
umaxp v16.8h, v16.8h, v18.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
cmhs v16.4s, v16.4s, v31.4s
st1 {v3.8h}, [x0], #16
and v16.16b, v16.16b, v31.16b
st1 {v7.8h}, [x0], #16
st1b {z16.s}, p2, [x2]
add x2, x2, #8
mov v16.d[0], v16.d[1]
st1b {z16.s}, p2, [x2]
ret
endfunc
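
What this implements, per the scalar reference (hedged paraphrase; the name
zigzag_interleave_8x8_cavlc_ref is mine): the four interleaved 4x4 sub-blocks
of src are split out into dst, and a nonzero flag per sub-block is written to
nnz — the umax/umaxp tree above is that nonzero reduction:

    #include <stdint.h>

    static void zigzag_interleave_8x8_cavlc_ref( int16_t *dst, int16_t *src,
                                                 uint8_t *nnz )
    {
        for( int i = 0; i < 4; i++ )
        {
            int nz = 0;
            for( int j = 0; j < 16; j++ )
            {
                nz |= src[i + j*4];            /* any nonzero coefficient? */
                dst[i*16 + j] = src[i + j*4];  /* de-interleave */
            }
            nnz[(i&1) + (i>>1)*8] = !!nz;      /* flags land at bytes 0,1,8,9 */
        }
    }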

common/aarch64/dct-a-sve2.S (new file)

@@ -0,0 +1,89 @@
/****************************************************************************
* dct-a-sve2.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2023 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
.arch armv8-a+sve+sve2
function add4x4_idct_sve2, export=1
mov x2, #FDEC_STRIDE
mov x11, x0
ptrue p0.h, vl8
ptrue p1.h, vl4
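// p0 (8 halfword lanes) predicates the widening ld1b pixel loads below;
// p1 (4 lanes) predicates the rounding shifts and the final st1b stores.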
ld1 {v0.8h, v1.8h}, [x1]
SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h
sshr v7.8h, v0.8h, #1
sshr v6.8h, v1.8h, #1
sub v7.8h, v7.8h, v1.8h
add v6.8h, v6.8h, v0.8h
mov v7.d[0], v7.d[1]
mov v6.d[0], v6.d[1]
ld1b {z28.h}, p0/z, [x11]
add x11, x11, x2
SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h
SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h
sshr v7.4h, v1.4h, #1
sshr v6.4h, v2.4h, #1
sub v7.4h, v7.4h, v2.4h
add v6.4h, v6.4h, v1.4h
ld1b {z29.h}, p0/z, [x11]
add x11, x11, x2
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
srshr z0.h, p1/m, z0.h, #6
srshr z1.h, p1/m, z1.h, #6
ld1b {z31.h}, p0/z, [x11]
add x11, x11, x2
srshr z2.h, p1/m, z2.h, #6
srshr z3.h, p1/m, z3.h, #6
ld1b {z30.h}, p0/z, [x11]
add v0.8h, v0.8h, v28.8h
add v1.8h, v1.8h, v29.8h
add v2.8h, v2.8h, v30.8h
add v3.8h, v3.8h, v31.8h
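// sqxtunb (SVE2) saturates each halfword to an unsigned byte in the bottom
// half of the lane, standing in for NEON's sqxtun here.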
sqxtunb z0.b, z0.h
sqxtunb z1.b, z1.h
sqxtunb z2.b, z2.h
sqxtunb z3.b, z3.h
st1b {z0.h}, p1, [x0]
add x0, x0, x2
st1b {z1.h}, p1, [x0]
add x0, x0, x2
st1b {z3.h}, p1, [x0]
add x0, x0, x2
st1b {z2.h}, p1, [x0]
ret
endfunc
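
For reference, the scalar inverse transform ends with a rounded shift and a
clipped add onto the prediction — exactly what the predicated srshr #6 and
sqxtunb above vectorize. A hedged C paraphrase of the 8-bit reference
(condensed; the names add4x4_idct_ref and clip_pixel are mine):

    #include <stdint.h>

    #define FDEC_STRIDE 32

    static inline uint8_t clip_pixel( int x )
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;   /* the job sqxtunb does */
    }

    static void add4x4_idct_ref( uint8_t *p_dst, int16_t dct[16] )
    {
        int16_t d[16], tmp[16];
        for( int i = 0; i < 4; i++ )            /* column butterflies */
        {
            int s02 =  dct[0*4+i]       +  dct[2*4+i];
            int d02 =  dct[0*4+i]       -  dct[2*4+i];
            int s13 =  dct[1*4+i]       + (dct[3*4+i] >> 1);
            int d13 = (dct[1*4+i] >> 1) -  dct[3*4+i];
            tmp[i*4+0] = s02 + s13;  tmp[i*4+1] = d02 + d13;
            tmp[i*4+2] = d02 - d13;  tmp[i*4+3] = s02 - s13;
        }
        for( int i = 0; i < 4; i++ )            /* row butterflies */
        {
            int s02 =  tmp[0*4+i]       +  tmp[2*4+i];
            int d02 =  tmp[0*4+i]       -  tmp[2*4+i];
            int s13 =  tmp[1*4+i]       + (tmp[3*4+i] >> 1);
            int d13 = (tmp[1*4+i] >> 1) -  tmp[3*4+i];
            d[0*4+i] = (s02 + s13 + 32) >> 6;   /* srshr #6 equivalent */
            d[1*4+i] = (d02 + d13 + 32) >> 6;
            d[2*4+i] = (d02 - d13 + 32) >> 6;
            d[3*4+i] = (s02 - s13 + 32) >> 6;
        }
        for( int y = 0; y < 4; y++ )            /* add residual, clip */
            for( int x = 0; x < 4; x++ )
                p_dst[x + y*FDEC_STRIDE] =
                    clip_pixel( p_dst[x + y*FDEC_STRIDE] + d[x + y*4] );
    }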

common/aarch64/dct.h

@@ -91,4 +91,13 @@ int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel
#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
+void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
+void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
+#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
+void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif

common/dct.c

@@ -707,6 +707,18 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
}
+#if HAVE_SVE
+if ( cpu&X264_CPU_SVE )
+{
+dctf->sub4x4_dct = x264_sub4x4_dct_sve;
+}
+#endif
+#if HAVE_SVE2
+if ( cpu&X264_CPU_SVE2 )
+{
+dctf->add4x4_idct = x264_add4x4_idct_sve2;
+}
+#endif
#endif
#if HAVE_MSA
@@ -1105,6 +1117,12 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
}
+#if HAVE_SVE
+if( cpu&X264_CPU_SVE )
+{
+pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve;
+}
+#endif
#endif // HAVE_AARCH64
#if HAVE_ALTIVEC
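
The cpu&X264_CPU_SVE / cpu&X264_CPU_SVE2 tests above depend on runtime
feature detection (x264's lives in common/cpu.c, not shown in this commit).
A minimal sketch of how such flags are typically derived on AArch64 Linux —
the bit values and the helper name detect_sve_flags are placeholders, not
x264's actual code:

    #include <stdint.h>
    #include <sys/auxv.h>      /* getauxval, AT_HWCAP, AT_HWCAP2 */
    #include <asm/hwcap.h>     /* HWCAP_SVE, HWCAP2_SVE2 (aarch64 Linux) */

    #ifndef X264_CPU_SVE            /* placeholder bit values for illustration; */
    #define X264_CPU_SVE  (1u << 0) /* the real flags are defined in x264.h */
    #define X264_CPU_SVE2 (1u << 1)
    #endif

    /* Map kernel HWCAP bits to the CPU flags tested in the diff above. */
    static uint32_t detect_sve_flags( void )
    {
        uint32_t flags = 0;
    #if defined(__aarch64__) && defined(HWCAP_SVE)
        if( getauxval( AT_HWCAP ) & HWCAP_SVE )
            flags |= X264_CPU_SVE;
    #endif
    #if defined(__aarch64__) && defined(HWCAP2_SVE2)
        if( getauxval( AT_HWCAP2 ) & HWCAP2_SVE2 )
            flags |= X264_CPU_SVE2;
    #endif
        return flags;
    }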