loongarch: Improve the performance of sad/sad_x3/sad_x4 series functions

Encoding performance improved from 4.92 fps to 6.32 fps.
Tested with the following commands:
./configure && make -j5
./x264 --threads 4 -o out.mkv yuv_1920x1080.yuv

functions           performance     performance
                        (c)            (asm)
sad_4x4                 13               3
sad_4x8                 26               7
sad_4x16                57               13
sad_8x4                 24               3
sad_8x8                 54               8
sad_8x16                108              13
sad_16x8                95               8
sad_16x16               189              13
sad_x3_4x4              37               6
sad_x3_4x8              71               13
sad_x3_8x4              70               8
sad_x3_8x8              162              14
sad_x3_8x16             323              25
sad_x3_16x8             279              15
sad_x3_16x16            555              27
sad_x4_4x4              49               8
sad_x4_4x8              95               17
sad_x4_8x4              94               8
sad_x4_8x8              214              16
sad_x4_8x16             429              33
sad_x4_16x8             372              18
sad_x4_16x16            740              34

Signed-off-by: wanglu <wanglu@loongson.cn>
This commit is contained in:
Loongson Technology Corporation Limited 2023-10-02 22:58:07 +08:00 committed by Shiyou Yin
parent d7d283f634
commit 00b8e3b9cd
4 changed files with 2944 additions and 1 deletions

View File

@ -201,6 +201,7 @@ endif
ifeq ($(SYS_ARCH),LOONGARCH)
ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),)
SRCASM_X += common/loongarch/deblock-a.S \
common/loongarch/sad-a.S \
SRCS_X +=

335
common/loongarch/pixel.h Normal file
View File

@ -0,0 +1,335 @@
/*****************************************************************************
* pixel.h: loongarch pixel metrics
*****************************************************************************
* Copyright (C) 2023 x264 project
*
* Authors: Lu Wang <wanglu@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_PIXEL_H
#define X264_LOONGARCH_PIXEL_H
/* pixel_satd_WxH
 *
 * Prototypes for the LSX variants (4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8,
 * 16x16) and the LASX variants (same list minus 4x4).
 * Every function takes two pixel pointers, each followed by its own stride,
 * and returns an int32_t.
 * NOTE(review): "satd" presumably means sum of absolute transformed
 * differences, going by the name only -- confirm against the implementation.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header, so the #define must stay ahead of its prototype. */

/* LSX variants */
#define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx)
int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx)
int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx)
int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx)
int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx)
int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx)
int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx)
int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx)
int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* LASX variants (no 4x4 version is declared) */
#define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx)
int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx)
int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx)
int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx)
int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx)
int32_t x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx)
int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx)
int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* pixel_sad_x4_WxH
 *
 * Prototypes for the LSX variants (16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4)
 * and the LASX variants (16x16, 16x8, 8x8, 8x4).
 * Every function takes one source pointer, four reference pointers, a single
 * stride shared by the references, and an output array; it returns void.
 * The [4] in "p_sad_array[4]" is documentation only: a C array parameter is
 * adjusted to a pointer, so the caller must supply room for four int32_t.
 * NOTE(review): there is no source-stride parameter; presumably p_src uses a
 * fixed stride known to the implementation -- confirm against sad-a.S.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants (the 4x4 version is declared after the LASX group below) */
#define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx)
void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx)
void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx)
void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx)
void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx)
void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx)
void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* LASX variants */
#define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx)
void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx)
void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx)
void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx)
void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* Remaining LSX variant */
#define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx)
void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* pixel_sad_x3_WxH
 *
 * Prototypes for the LSX variants (16x16, 16x8, 8x16, 8x8, 8x4, 4x4, 4x8)
 * and the LASX variants (16x16, 16x8).
 * Every function takes one source pointer, three reference pointers, a single
 * stride shared by the references, and an output array; it returns void.
 * The [3] in "p_sad_array[3]" is documentation only: a C array parameter is
 * adjusted to a pointer, so the caller must supply room for three int32_t.
 * NOTE(review): there is no source-stride parameter; presumably p_src uses a
 * fixed stride known to the implementation -- confirm against sad-a.S.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants */
#define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx)
void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx)
void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx)
void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx)
void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx)
void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx)
void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx)
void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
/* LASX variants */
#define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx)
void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx)
void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
/* pixel_sad_WxH
 *
 * Prototypes for the LSX variants (16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8,
 * 4x4) and the single LASX variant (8x4).
 * Every function takes a source pointer with its stride and a reference
 * pointer with its stride, and returns an int32_t.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants */
#define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx)
int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx)
int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx)
int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx)
int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx)
int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx)
int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx)
int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx)
int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* LASX variant */
#define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx)
int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* hadamard_ac
 *
 * Prototypes for the LSX and LASX variants. Each instruction set declares one
 * function without the "pixel_" prefix (hadamard_ac_8x8) plus the
 * pixel_hadamard_ac_WxH entry points for 8x8, 8x16, 16x8 and 16x16.
 * Every function takes a pixel pointer and a stride and returns a uint64_t.
 * NOTE(review): the layout of the 64-bit return value is not visible in this
 * header -- check the implementation before unpacking it.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants */
#define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx)
uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx)
uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx)
uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx)
uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx)
uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
/* LASX variants */
#define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx)
uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx)
uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx)
uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx)
uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx)
uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride );
/* intra_satd_x3
 *
 * Prototypes for the LSX variants (16x16, 8x8c, 4x4) and the single LASX
 * variant (16x16).
 * Every function takes two pixel pointers (p_enc, p_dec) and an output array,
 * and returns void. No stride is passed.
 * The [3] in "p_sad_array[3]" is documentation only: a C array parameter is
 * adjusted to a pointer, so the caller must supply room for three int32_t.
 * NOTE(review): which three results are written, and in what order, is not
 * visible in this header -- confirm against the implementation.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants */
#define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx)
void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx)
void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx)
void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
/* LASX variant */
#define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx)
void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
/* pixel_ssd_WxH
 *
 * Prototypes for the LSX variants (16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8,
 * 4x4) and the LASX variants (16x16, 16x8, 8x16, 8x8).
 * Every function takes a source pointer with its stride and a reference
 * pointer with its stride, and returns an int32_t.
 * NOTE(review): "ssd" presumably means sum of squared differences, going by
 * the name only -- confirm against the implementation.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants */
#define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx)
int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx)
int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx)
int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx)
int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx)
int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx)
int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx)
int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx)
int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* LASX variants */
#define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx)
int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx)
int32_t x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx)
int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx)
int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* pixel_var_WxH and pixel_var2_8xH
 *
 * pixel_var2 (LSX and LASX, sizes 8x16 and 8x8): takes two pixel pointers and
 * an "ssd" array and returns an int32_t. No stride is passed.
 * pixel_var (LSX only, sizes 16x16, 8x16 and 8x8): takes a pixel pointer and
 * a stride and returns a uint64_t.
 * The [2] in "ssd[2]" is documentation only: a C array parameter is adjusted
 * to a pointer, so the caller must supply room for two int32_t.
 * NOTE(review): the layout of the uint64_t returned by pixel_var and the
 * meaning of the two ssd slots are not visible in this header -- confirm
 * against the implementation.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX variants */
#define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx)
int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
#define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx)
int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
#define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx)
uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx)
uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx)
uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
/* LASX variants (pixel_var2 only) */
#define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx)
int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
#define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx)
int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
/* pixel_sa8d_WxH and intra_sa8d_x3_8x8
 *
 * pixel_sa8d (LSX and LASX, sizes 8x8 and 16x16): takes two pixel pointers,
 * each followed by its own stride, and returns an int32_t.
 * intra_sa8d_x3_8x8 (LSX and LASX): takes a pixel pointer, an edge buffer and
 * an output array, and returns void.
 * The [36] and [3] bounds are documentation only: C array parameters are
 * adjusted to pointers, so the caller must supply buffers of at least those
 * sizes.
 * NOTE(review): the layout of p_edge and the order of the three results are
 * not visible in this header -- confirm against the implementation.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */

/* LSX pixel_sa8d */
#define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx)
int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx)
int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* intra_sa8d_x3_8x8, LSX then LASX */
#define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx)
void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
#define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx)
void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
/* LASX pixel_sa8d */
#define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx)
int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx)
int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* intra_sad_x3
 *
 * Prototypes for the LSX variants (16x16, 8x8, 8x8c, 4x4); no LASX variant
 * is declared.
 * The 16x16, 8x8c and 4x4 functions take two pixel pointers (p_enc, p_dec);
 * the 8x8 function takes p_enc and an edge buffer instead of p_dec.
 * All of them also take an output array and return void.
 * The [36] and [3] bounds are documentation only: C array parameters are
 * adjusted to pointers, so the caller must supply buffers of at least those
 * sizes.
 * NOTE(review): the layout of p_edge and the order of the three results are
 * not visible in this header -- confirm against the implementation.
 *
 * Each public name is first mapped through x264_template(), which is defined
 * outside this header. */
#define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx)
void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx)
void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx)
void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx)
void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#endif

2585
common/loongarch/sad-a.S Normal file

File diff suppressed because it is too large Load Diff

View File

@ -45,7 +45,9 @@
#if HAVE_MSA
# include "mips/pixel.h"
#endif
#if HAVE_LSX
# include "loongarch/pixel.h"
#endif
/****************************************************************************
* pixel_sad_WxH
@ -1531,6 +1533,26 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
}
#endif // HAVE_MSA
#if HAVE_LSX
if( cpu&X264_CPU_LSX )
{
INIT8( sad, _lsx );
INIT8_NAME( sad_aligned, sad, _lsx );
INIT7( sad_x3, _lsx );
INIT7( sad_x4, _lsx );
}
if( cpu&X264_CPU_LASX )
{
pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_lasx;
pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_lasx;
pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_lasx;
pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_lasx;
pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_lasx;
pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_lasx;
}
#endif /* HAVE_LSX */
#endif // HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )