diff --git a/Makefile b/Makefile index ee1451c7..02235748 100644 --- a/Makefile +++ b/Makefile @@ -201,6 +201,7 @@ endif ifeq ($(SYS_ARCH),LOONGARCH) ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),) SRCASM_X += common/loongarch/deblock-a.S \ + common/loongarch/sad-a.S \ SRCS_X += diff --git a/common/loongarch/pixel.h b/common/loongarch/pixel.h new file mode 100644 index 00000000..347eecc2 --- /dev/null +++ b/common/loongarch/pixel.h @@ -0,0 +1,335 @@ +/***************************************************************************** + * pixel.h: loongarch pixel metrics + ***************************************************************************** + * Copyright (C) 2023 x264 project + * + * Authors: Lu Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_LOONGARCH_PIXEL_H +#define X264_LOONGARCH_PIXEL_H + +#define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx) +int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx) +int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx) +int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx) +int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx) +int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx) +int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx) +int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx) +int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx) +int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx) +int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define 
x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx) +int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx) +int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx) +int32_t x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx) +int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx) +int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx) +void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx) +void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx) +void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx) +void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx) +void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx) +void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); + +#define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx) +void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx) +void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx) +void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx) +void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx) +void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + 
uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); + +#define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx) +void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx) +void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx) +void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx) +void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx) +void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx) +void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx) +void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); + +#define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx) +void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx) +void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); + +#define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx) +int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx) +int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx) +int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx) +int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx) +int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx) +int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx) +int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx) +int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + 
uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx) +int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx) +uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx) +uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx) +uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx) +uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx) +uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ); + +#define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx) +uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx) +uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx) +uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx) +uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx) +uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride ); + +#define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx) +void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx) +void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx) +void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx) +void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); + +#define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx) +int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx) +int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx) +int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx) +int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx) +int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx) +int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, 
intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx) +int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx) +int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx) +int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx) +int32_t x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx) +int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx) +int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx) +int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); +#define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx) +int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); +#define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx) +uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx) +uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx) +uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); + +#define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx) +int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); +#define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx) +int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); + +#define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx) +int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx) +int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx) +void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +#define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx) +void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +#define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx) +int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx) +int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx) +void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx) +void 
x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +#define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx) +void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx) +void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); + +#endif diff --git a/common/loongarch/sad-a.S b/common/loongarch/sad-a.S new file mode 100644 index 00000000..e4b10c61 --- /dev/null +++ b/common/loongarch/sad-a.S @@ -0,0 +1,2585 @@ +/***************************************************************************** + * sad-a.S: loongarch sad functions + ***************************************************************************** + * Copyright (C) 2023 x264 project + * + * Authors: Lu Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" + +#if !HIGH_BIT_DEPTH + + +/* void x264_pixel_sad_x4_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x16_lasx + slli.d t1, a5, 1 + add.d t2, a5, t1 + slli.d t3, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 0 + xvld xr16, a0, 32 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr12, xr8, xr8 + xvhaddw.hu.bu xr13, xr9, xr9 + xvhaddw.hu.bu xr14, xr10, xr10 + xvhaddw.hu.bu xr15, xr11, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d 
a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 64 + xvld xr16, a0, 96 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 128 + xvld xr16, a0, 160 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 192 + xvld xr16, a0, 224 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of 
the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + xvori.b xr17, xr12, 0 + xvori.b xr18, xr13, 0 + xvpermi.q xr12, xr14, 0x02 + xvpermi.q xr14, xr17, 0x31 + xvpermi.q xr13, xr15, 0x02 + xvpermi.q xr15, xr18, 0x31 + xvadd.h xr12, xr12, xr14 + xvadd.h xr13, xr13, xr15 + xvhaddw.w.h xr12, xr12, xr12 + xvhaddw.w.h xr13, xr13, xr13 + xvhaddw.d.w xr12, xr12, xr12 + xvhaddw.d.w xr13, xr13, xr13 + xvhaddw.q.d xr12, xr12, xr12 + xvhaddw.q.d xr13, xr13, xr13 + xvpackev.w xr13, xr13, xr12 + // Store data to p_sad_array + xvstelm.d xr13, a6, 0, 0 + xvstelm.d xr13, a6, 8, 2 +endfunc_x264 + +/* void x264_pixel_sad_x4_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x8_lasx + slli.d t1, a5, 1 + add.d t2, a5, t1 + slli.d t3, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 0 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr12, xr8, xr8 + xvhaddw.hu.bu xr13, xr9, xr9 + xvhaddw.hu.bu xr14, xr10, xr10 + xvhaddw.hu.bu xr15, xr11, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 32 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 64 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 
+ xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 96 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + xvori.b xr17, xr12, 0 + xvori.b xr18, xr13, 0 + xvpermi.q xr12, xr14, 0x02 + xvpermi.q xr14, xr17, 0x31 + xvpermi.q xr13, xr15, 0x02 + xvpermi.q xr15, xr18, 0x31 + xvadd.h xr12, xr12, xr14 + xvadd.h xr13, xr13, xr15 + xvhaddw.w.h xr12, xr12, xr12 + xvhaddw.w.h xr13, xr13, xr13 + xvhaddw.d.w xr12, xr12, xr12 + xvhaddw.d.w xr13, xr13, xr13 + xvhaddw.q.d xr12, xr12, xr12 + xvhaddw.q.d xr13, xr13, xr13 + xvpackev.w xr13, xr13, xr12 + // Store data to p_sad_array + xvstelm.d xr13, a6, 0, 0 + xvstelm.d xr13, a6, 8, 2 +endfunc_x264 + +/* void x264_pixel_sad_x4_8x8_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x8_lasx + slli.d t1, a5, 1 + add.d t2, t1, a5 + slli.d t3, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vilvl.d vr14, vr15, vr14 + vilvl.d vr16, vr17, vr16 + vilvl.d vr18, vr19, vr18 + vilvl.d vr20, vr21, vr20 + xvpermi.q xr4, xr6, 0x02 + xvpermi.q xr8, xr10, 0x02 + xvpermi.q xr14, xr16, 0x02 + xvpermi.q xr18, xr20, 0x02 + // Calculate the absolute value of the difference + xvldrepl.d xr3, a0, 0 + xvabsd.bu xr5, xr3, xr4 + xvldrepl.d xr3, a0, 16 + xvabsd.bu xr9, xr3, xr8 + xvldrepl.d xr3, a0, 32 + xvabsd.bu xr10, xr3, xr14 + xvldrepl.d xr3, a0, 48 + xvabsd.bu xr11, xr3, xr18 + xvaddwev.h.bu xr0, xr5, xr9 + xvaddwod.h.bu xr1, xr5, xr9 + xvaddwev.h.bu xr2, xr10, xr11 + xvaddwod.h.bu xr22, xr10, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vilvl.d vr14, vr15, vr14 + vilvl.d vr16, vr17, vr16 + vilvl.d vr18, vr19, vr18 + vilvl.d vr20, vr21, vr20 + 
xvpermi.q xr4, xr6, 0x02 + xvpermi.q xr8, xr10, 0x02 + xvpermi.q xr14, xr16, 0x02 + xvpermi.q xr18, xr20, 0x02 + // Calculate the absolute value of the difference + xvldrepl.d xr3, a0, 64 + xvabsd.bu xr5, xr3, xr4 + xvldrepl.d xr3, a0, 80 + xvabsd.bu xr9, xr3, xr8 + xvldrepl.d xr3, a0, 96 + xvabsd.bu xr10, xr3, xr14 + xvldrepl.d xr3, a0, 112 + xvabsd.bu xr11, xr3, xr18 + xvaddwev.h.bu xr12, xr5, xr9 + xvaddwod.h.bu xr13, xr5, xr9 + xvaddwev.h.bu xr14, xr10, xr11 + xvaddwod.h.bu xr15, xr10, xr11 + xvadd.h xr5, xr0, xr12 + xvadd.h xr9, xr1, xr13 + xvadd.h xr10, xr2, xr14 + xvadd.h xr11, xr22, xr15 + xvadd.h xr5, xr5, xr9 + xvadd.h xr10, xr10, xr11 + xvadd.h xr10, xr10, xr5 + xvhaddw.wu.hu xr10, xr10, xr10 + xvhaddw.du.wu xr10, xr10, xr10 + xvpermi.q xr5, xr10, 0x01 + xvpickev.w xr10, xr5, xr10 + // Store data to p_sad_array + vst vr10, a6, 0 +endfunc_x264 + +/* void x264_pixel_sad_x4_8x4_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x4_lasx + slli.d t1, a5, 1 + add.d t2, t1, a5 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + fld.d f2, a0, 0 + fld.d f3, a0, 16 + fld.d f12, a0, 32 + fld.d f13, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 + + vilvl.d vr3, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr7, vr11, vr7 + vilvl.d vr13, vr13, vr12 + vilvl.d vr14, vr18, vr14 + vilvl.d vr15, vr19, vr15 + vilvl.d vr16, vr20, vr16 + vilvl.d vr17, vr21, vr17 + xvpermi.q xr3, xr13, 0x02 + xvpermi.q xr4, xr16, 0x02 + xvpermi.q xr5, xr17, 0x02 + xvpermi.q xr6, xr14, 0x02 + xvpermi.q xr7, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvpermi.d xr10, xr10, 0x4e + xvpermi.d xr11, xr11, 0x4e + xvadd.h xr8, xr8, xr10 + xvadd.h xr9, xr9, xr11 + xvhaddw.w.h xr8, xr8, xr8 + xvhaddw.w.h xr9, xr9, xr9 + xvhaddw.d.w xr8, xr8, xr8 + xvhaddw.d.w xr9, xr9, xr9 + xvhaddw.q.d xr8, xr8, xr8 + xvhaddw.q.d xr9, xr9, xr9 + xvpackev.w xr9, xr9, xr8 + + // Store data to p_sad_array + xvstelm.d xr9, a6, 0, 0 + xvstelm.d xr9, a6, 8, 2 +endfunc_x264 + +/* void x264_pixel_sad_x4_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_4x4_lsx + slli.d t0, a5, 1 + add.d t1, a5, t0 + slli.d t2, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + fld.s f2, a0, 0 + fld.s f3, a0, 16 + fld.s f4, a1, 0 + fldx.s f8, a1, a5 + fld.s f5, a2, 0 + fldx.s f9, a2, a5 + fld.s f6, a3, 0 + fldx.s f10, a3, a5 + fld.s f7, a4, 0 + fldx.s f11, a4, a5 + vilvl.w vr3, vr3, vr2 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr7, vr11, vr7 + + fld.s f2, a0, 32 + fld.s f0, a0, 48 + fldx.s f8, a1, t0 + fldx.s f12, a1, t1 + fldx.s f9, a2, t0 + fldx.s f13, a2, t1 + fldx.s f10, a3, t0 + fldx.s f14, a3, t1 + fldx.s f11, a4, t0 + fldx.s f15, a4, t1 + vilvl.w vr2, vr0, vr2 + vilvl.w vr8, vr12, vr8 + vilvl.w vr9, vr13, vr9 + vilvl.w vr10, vr14, vr10 + vilvl.w vr11, vr15, vr11 + vilvl.d vr3, vr2, 
vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr7, vr11, vr7 + + // Calculate the absolute value of the difference + vabsd.bu vr8, vr3, vr4 + vabsd.bu vr9, vr3, vr5 + vabsd.bu vr10, vr3, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.du.wu vr10, vr10, vr10 + vhaddw.du.wu vr11, vr11, vr11 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + vhaddw.qu.du vr10, vr10, vr10 + vhaddw.qu.du vr11, vr11, vr11 + + // Store data to p_sad_array + vstelm.w vr8, a6, 0, 0 + vstelm.w vr9, a6, 4, 0 + vstelm.w vr10, a6, 8, 0 + vstelm.w vr11, a6, 12, 0 +endfunc_x264 + +/* void x264_pixel_sad_x3_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x16_lasx + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + slli.d t1, a4, 1 + add.d t2, a4, t1 + slli.d t3, a4, 2 + + xvld xr2, a0, 0 + xvld xr3, a0, 32 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr16, xr7, xr7 + xvhaddw.hu.bu xr17, xr8, xr8 + xvhaddw.hu.bu xr18, xr9, xr9 + xvhaddw.hu.bu xr19, xr10, xr10 + xvhaddw.hu.bu xr20, xr11, xr11 + xvhaddw.hu.bu xr21, xr12, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 64 + xvld xr3, a0, 96 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 128 + xvld xr3, a0, 160 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu 
xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 192 + xvld xr3, a0, 224 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + xvadd.h xr11, xr16, xr19 + xvadd.h xr12, xr17, xr20 + xvadd.h xr13, xr18, xr21 + + xvhaddw.wu.hu xr11, xr11, xr11 + xvhaddw.wu.hu xr12, xr12, xr12 + xvhaddw.wu.hu xr13, xr13, xr13 + xvhaddw.du.wu xr11, xr11, xr11 + xvhaddw.du.wu xr12, xr12, xr12 + xvhaddw.du.wu xr13, xr13, xr13 + xvhaddw.qu.du xr11, xr11, xr11 + xvhaddw.qu.du xr12, xr12, xr12 + xvhaddw.qu.du xr13, xr13, xr13 + xvpickve.w xr17, xr11, 4 + xvpickve.w xr18, xr12, 4 + xvpickve.w xr19, xr13, 4 + xvadd.w xr11, xr11, xr17 + xvadd.w xr12, xr12, xr18 + xvadd.w xr13, xr13, xr19 + + // Store data to p_sad_array + vstelm.w vr11, a5, 0, 0 + vstelm.w vr12, a5, 4, 0 + vstelm.w vr13, a5, 8, 0 +endfunc_x264 + +/* void x264_pixel_sad_x3_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x8_lasx + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + slli.d t1, a4, 1 + add.d t2, a4, t1 + slli.d t3, a4, 2 + + xvld xr2, a0, 0 + xvld xr3, a0, 32 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr16, xr7, xr7 + xvhaddw.hu.bu xr17, xr8, xr8 + xvhaddw.hu.bu xr18, xr9, xr9 + xvhaddw.hu.bu xr19, xr10, xr10 + xvhaddw.hu.bu xr20, xr11, xr11 + xvhaddw.hu.bu xr21, xr12, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 64 + xvld xr3, a0, 96 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + + // 
Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + xvadd.h xr11, xr16, xr19 + xvadd.h xr12, xr17, xr20 + xvadd.h xr13, xr18, xr21 + + xvhaddw.wu.hu xr11, xr11, xr11 + xvhaddw.wu.hu xr12, xr12, xr12 + xvhaddw.wu.hu xr13, xr13, xr13 + xvhaddw.du.wu xr11, xr11, xr11 + xvhaddw.du.wu xr12, xr12, xr12 + xvhaddw.du.wu xr13, xr13, xr13 + xvhaddw.qu.du xr11, xr11, xr11 + xvhaddw.qu.du xr12, xr12, xr12 + xvhaddw.qu.du xr13, xr13, xr13 + xvpickve.w xr17, xr11, 4 + xvpickve.w xr18, xr12, 4 + xvpickve.w xr19, xr13, 4 + xvadd.w xr11, xr11, xr17 + xvadd.w xr12, xr12, xr18 + xvadd.w xr13, xr13, xr19 + + // Store data to p_sad_array + vstelm.w vr11, a5, 0, 0 + vstelm.w vr12, a5, 4, 0 + vstelm.w vr13, a5, 8, 0 +endfunc_x264 + +/* void x264_pixel_sad_x3_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_4x4_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.s f3, a0, 0 + fld.s f7, a0, 16 + fld.s f11, a0, 32 + fld.s f15, a0, 48 + FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + + vilvl.w vr3, vr7, vr3 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr11, vr3 + vilvl.d vr4, vr12, vr4 + vilvl.d vr5, vr13, vr5 + vilvl.d vr6, vr14, vr6 + + // Calculate the absolute value of the difference + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x4_lasx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x4_lasx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr4, vr6, vr4 + vilvl.d vr7, vr9, vr7 + vilvl.d vr8, vr10, vr8 + xvpermi.q xr3, xr7, 0x02 + xvpermi.q xr4, xr8, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr5, xr3, xr4 + xvhaddw.hu.bu xr6, xr5, xr5 + xvhaddw.wu.hu xr6, xr6, xr6 + xvhaddw.du.wu xr6, xr6, xr6 + xvhaddw.qu.du xr6, xr6, xr6 + + xvpickve2gr.wu t2, xr6, 0 + xvpickve2gr.wu t3, xr6, 4 + add.d a0, t2, t3 +endfunc_x264 + +/* int32_t x264_pixel_sad_4x4_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, 
intptr_t i_ref_stride) + */ +function_x264 pixel_sad_4x4_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + + // Calculate the absolute value of the difference + vabsd.bu vr5, vr3, vr4 + vhaddw.hu.bu vr6, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_4x8_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_4x8_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr11, vr3, vr4 + vhaddw.hu.bu vr11, vr11, vr11 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr5, vr3, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + + vadd.h vr6, vr11, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_4x16_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_4x16_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr11, vr3, vr4 + vhaddw.hu.bu vr11, vr11, vr11 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr12, vr3, vr4 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr11, vr11, vr12 +.endr + + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.du.wu vr11, vr11, vr11 + vhaddw.qu.du vr11, vr11, vr11 + vpickve2gr.wu a0, vr11, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x4_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x4_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr6, vr11, vr12 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x8_lsx(uint8_t *p_src, 
intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x8_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr13, vr11, vr12 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr6, vr11, vr12 + vadd.h vr6, vr6, vr13 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x16_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x16_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr13, vr11, vr12 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr14, vr11, vr12 + vadd.h vr13, vr13, vr14 +.endr + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu a0, vr13, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_16x8_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_16x8_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr14, vr8, vr9 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr12, vr8, vr9 + + vadd.h vr13, vr12, vr14 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu a0, vr13, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_16x16_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 
pixel_sad_16x16_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr13, vr8, vr9 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr12, vr8, vr9 + vadd.h vr13, vr12, vr13 +.endr + + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu a0, vr13, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_4x8_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.s f3, a0, 0 + fld.s f7, a0, 16 + fld.s f11, a0, 32 + fld.s f15, a0, 48 + FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.w vr3, vr7, vr3 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr11, vr3 + vilvl.d vr4, vr12, vr4 + vilvl.d vr5, vr13, vr5 + vilvl.d vr6, vr14, vr6 + vabsd.bu vr0, vr3, vr4 + vabsd.bu vr1, vr3, vr5 + vabsd.bu vr2, vr3, vr6 + + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + fld.s f3, a0, 64 + fld.s f7, a0, 80 + fld.s f11, a0, 96 + fld.s f15, a0, 112 + FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.w vr3, vr7, vr3 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr11, vr3 + vilvl.d vr4, vr12, vr4 + vilvl.d vr5, vr13, vr5 + vilvl.d vr6, vr14, vr6 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + + vhaddw.hu.bu vr0, vr0, vr0 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.hu.bu vr2, vr2, vr2 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vadd.h vr7, vr7, vr0 + vadd.h vr8, vr8, vr1 + vadd.h vr9, vr9, vr2 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_8x4_lsx + slli.d t1, a4, 1 + 
add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr0, vr3, vr4 + vabsd.bu vr1, vr3, vr5 + vabsd.bu vr2, vr3, vr6 + vabsd.bu vr3, vr11, vr12 + vabsd.bu vr4, vr11, vr13 + vabsd.bu vr5, vr11, vr14 + vhaddw.hu.bu vr0, vr0, vr0 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.hu.bu vr2, vr2, vr2 + vhaddw.hu.bu vr3, vr3, vr3 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vadd.h vr7, vr0, vr3 + vadd.h vr8, vr1, vr4 + vadd.h vr9, vr2, vr5 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_8x8_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr0, vr7, vr10 + vadd.h vr1, vr8, vr15 + vadd.h vr2, vr9, vr16 + + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + fld.d f3, a0, 64 + fld.d f7, a0, 80 + fld.d f11, a0, 96 + fld.d f15, a0, 112 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr7, vr7, vr10 + vadd.h vr8, vr8, vr15 + vadd.h vr9, vr9, vr16 + + vadd.h vr7, vr7, vr0 + vadd.h vr8, vr8, vr1 + vadd.h vr9, vr9, vr2 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + 
vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_8x16_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr0, vr7, vr10 + vadd.h vr1, vr8, vr15 + vadd.h vr2, vr9, vr16 + +.rept 3 + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + addi.d a0, a0, 64 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr7, vr7, vr10 + vadd.h vr8, vr8, vr15 + vadd.h vr9, vr9, vr16 + vadd.h vr0, vr7, vr0 + vadd.h vr1, vr8, vr1 + vadd.h vr2, vr9, vr2 +.endr + + vhaddw.wu.hu vr0, vr0, vr0 + vhaddw.wu.hu vr1, vr1, vr1 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.du.wu vr0, vr0, vr0 + vhaddw.du.wu vr1, vr1, vr1 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr0, vr0, vr0 + vhaddw.qu.du vr1, vr1, vr1 + vhaddw.qu.du vr2, vr2, vr2 + + // Store data to p_sad_array + vstelm.w vr0, a5, 0, 0 + vstelm.w vr1, a5, 4, 0 + vstelm.w vr2, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x8_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu 
vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr16, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr17, vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr18, vr1, vr0 + + // vr16, vr17, vr18 + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + vld vr0, a0, 64 + vld vr1, a0, 80 + vld vr2, a0, 96 + vld vr3, a0, 112 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr2, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr3, vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr4, vr1, vr0 + + vadd.h vr0, vr16, vr2 + vadd.h vr1, vr17, vr3 + vadd.h vr2, vr18, vr4 + vhaddw.wu.hu vr0, vr0, vr0 + vhaddw.wu.hu vr1, vr1, vr1 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.du.wu vr0, vr0, vr0 + vhaddw.du.wu vr1, vr1, vr1 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr0, vr0, vr0 + vhaddw.qu.du vr1, vr1, vr1 + vhaddw.qu.du vr2, vr2, vr2 + + // Store data to p_sad_array + vstelm.w vr0, a5, 0, 0 + vstelm.w vr1, a5, 4, 0 + vstelm.w vr2, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x16_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr16, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr17, 
vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr18, vr1, vr0 + +.rept 3 + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + addi.d a0, a0, 64 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr2, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr3, vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr4, vr1, vr0 + + vadd.h vr16, vr16, vr2 + vadd.h vr17, vr17, vr3 + vadd.h vr18, vr18, vr4 +.endr + + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + + // Store data to p_sad_array + vstelm.w vr16, a5, 0, 0 + vstelm.w vr17, a5, 4, 0 + vstelm.w vr18, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_4x8_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + fld.s f0, a0, 0 + fld.s f1, a0, 16 + fld.s f2, a0, 32 + fld.s f3, a0, 48 + FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + + vilvl.w vr0, vr1, vr0 + vilvl.w vr2, vr3, vr2 + vilvl.d vr0, vr2, vr0 + vilvl.w vr4, vr8, vr4 + vilvl.w vr12, vr16, vr12 + vilvl.d vr1, vr12, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr13, vr17, vr13 + vilvl.d vr2, vr13, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr14, vr6 + vilvl.w vr7, vr11, vr7 + vilvl.w vr15, vr19, vr15 + vilvl.d vr4, vr15, vr7 + vabsd.bu vr1, vr0, vr1 + vabsd.bu vr2, vr0, vr2 + vabsd.bu vr3, vr0, vr3 + vabsd.bu vr4, vr0, vr4 + vhaddw.hu.bu vr20, vr1, vr1 + vhaddw.hu.bu vr21, vr2, vr2 + vhaddw.hu.bu vr22, vr3, vr3 + vhaddw.hu.bu vr23, vr4, vr4 + + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + fld.s f0, a0, 64 + fld.s f1, a0, 80 + fld.s f2, a0, 96 + fld.s f3, a0, 112 + FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + + vilvl.w vr0, vr1, vr0 + vilvl.w vr2, vr3, vr2 + vilvl.d vr0, vr2, vr0 + vilvl.w vr4, vr8, vr4 + vilvl.w vr12, vr16, vr12 + vilvl.d vr1, vr12, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr13, vr17, vr13 + vilvl.d vr2, vr13, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w 
vr14, vr18, vr14 + vilvl.d vr3, vr14, vr6 + vilvl.w vr7, vr11, vr7 + vilvl.w vr15, vr19, vr15 + vilvl.d vr4, vr15, vr7 + vabsd.bu vr1, vr0, vr1 + vabsd.bu vr2, vr0, vr2 + vabsd.bu vr3, vr0, vr3 + vabsd.bu vr4, vr0, vr4 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.hu.bu vr2, vr2, vr2 + vhaddw.hu.bu vr3, vr3, vr3 + vhaddw.hu.bu vr4, vr4, vr4 + vadd.h vr16, vr20, vr1 + vadd.h vr17, vr21, vr2 + vadd.h vr18, vr22, vr3 + vadd.h vr19, vr23, vr4 + + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.du.wu vr19, vr19, vr19 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + vhaddw.qu.du vr19, vr19, vr19 + + // Store data to p_sad_array + vstelm.w vr16, a6, 0, 0 + vstelm.w vr17, a6, 4, 0 + vstelm.w vr18, a6, 8, 0 + vstelm.w vr19, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x4_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr16, vr4, vr12 + vadd.h vr17, vr5, vr13 + vadd.h vr18, vr6, vr14 + vadd.h vr19, vr7, vr15 + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.du.wu vr19, vr19, vr19 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + vhaddw.qu.du vr19, vr19, vr19 + + // Store data to p_sad_array + vstelm.w vr16, a6, 0, 0 + vstelm.w vr17, a6, 4, 0 + vstelm.w vr18, a6, 8, 0 + vstelm.w vr19, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x8_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, 
vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr20, vr4, vr12 + vadd.h vr21, vr5, vr13 + vadd.h vr22, vr6, vr14 + vadd.h vr23, vr7, vr15 + + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + fld.d f0, a0, 64 + fld.d f1, a0, 80 + fld.d f2, a0, 96 + fld.d f3, a0, 112 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr16, vr4, vr12 + vadd.h vr17, vr5, vr13 + vadd.h vr18, vr6, vr14 + vadd.h vr19, vr7, vr15 + + vadd.h vr16, vr16, vr20 + vadd.h vr17, vr17, vr21 + vadd.h vr18, vr18, vr22 + vadd.h vr19, vr19, vr23 + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.du.wu vr19, vr19, vr19 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + vhaddw.qu.du vr19, vr19, vr19 + // Store data to p_sad_array + vstelm.w vr16, a6, 0, 0 + vstelm.w vr17, a6, 4, 0 + vstelm.w vr18, a6, 8, 0 + vstelm.w vr19, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x16_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + 
vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr20, vr4, vr12 + vadd.h vr21, vr5, vr13 + vadd.h vr22, vr6, vr14 + vadd.h vr23, vr7, vr15 + +.rept 3 + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + addi.d a0, a0, 64 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr16, vr4, vr12 + vadd.h vr17, vr5, vr13 + vadd.h vr18, vr6, vr14 + vadd.h vr19, vr7, vr15 + vadd.h vr20, vr16, vr20 + vadd.h vr21, vr17, vr21 + vadd.h vr22, vr18, vr22 + vadd.h vr23, vr19, vr23 +.endr + vhaddw.wu.hu vr20, vr20, vr20 + vhaddw.wu.hu vr21, vr21, vr21 + vhaddw.wu.hu vr22, vr22, vr22 + vhaddw.wu.hu vr23, vr23, vr23 + vhaddw.du.wu vr20, vr20, vr20 + vhaddw.du.wu vr21, vr21, vr21 + vhaddw.du.wu vr22, vr22, vr22 + vhaddw.du.wu vr23, vr23, vr23 + vhaddw.qu.du vr20, vr20, vr20 + vhaddw.qu.du vr21, vr21, vr21 + vhaddw.qu.du vr22, vr22, vr22 + vhaddw.qu.du vr23, vr23, vr23 + // Store data to p_sad_array + vstelm.w vr20, a6, 0, 0 + vstelm.w vr21, a6, 4, 0 + vstelm.w vr22, a6, 8, 0 + vstelm.w vr23, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x8_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu 
vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr20, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr21, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr22, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr23, vr0, vr1 + + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + vld vr0, a0, 64 + vld vr1, a0, 80 + vld vr2, a0, 96 + vld vr3, a0, 112 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr16, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr17, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr18, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr19, vr0, vr1 + + vadd.h vr20, vr16, vr20 + vadd.h vr21, vr17, vr21 + vadd.h vr22, vr18, vr22 + vadd.h vr23, vr19, vr23 + vhaddw.wu.hu vr20, vr20, vr20 + vhaddw.wu.hu vr21, vr21, vr21 + vhaddw.wu.hu vr22, vr22, vr22 + vhaddw.wu.hu vr23, vr23, vr23 + vhaddw.du.wu vr20, vr20, vr20 + vhaddw.du.wu vr21, vr21, vr21 + vhaddw.du.wu vr22, vr22, vr22 + vhaddw.du.wu vr23, vr23, vr23 + vhaddw.qu.du vr20, vr20, vr20 + vhaddw.qu.du vr21, vr21, vr21 + vhaddw.qu.du vr22, vr22, vr22 + vhaddw.qu.du vr23, vr23, vr23 + // Store data to p_sad_array + vstelm.w vr20, a6, 0, 0 + vstelm.w vr21, a6, 4, 0 + vstelm.w vr22, a6, 8, 0 + vstelm.w vr23, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x16_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, 
vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr20, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr21, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr22, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr23, vr0, vr1 + +.rept 3 + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + addi.d a0, a0, 64 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr16, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr17, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr18, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr19, vr0, vr1 + vadd.h vr20, vr16, vr20 + vadd.h vr21, vr17, vr21 + vadd.h vr22, vr18, vr22 + vadd.h vr23, vr19, vr23 +.endr + vhaddw.wu.hu vr20, vr20, vr20 + vhaddw.wu.hu vr21, vr21, vr21 + vhaddw.wu.hu vr22, vr22, vr22 + vhaddw.wu.hu vr23, vr23, vr23 + vhaddw.du.wu vr20, vr20, vr20 + vhaddw.du.wu vr21, vr21, vr21 + vhaddw.du.wu vr22, vr22, vr22 + vhaddw.du.wu vr23, vr23, vr23 + vhaddw.qu.du vr20, vr20, vr20 + vhaddw.qu.du vr21, vr21, vr21 + vhaddw.qu.du vr22, vr22, vr22 + vhaddw.qu.du vr23, vr23, vr23 + // Store data to p_sad_array + vstelm.w vr20, a6, 0, 0 + vstelm.w vr21, a6, 4, 0 + vstelm.w vr22, a6, 8, 0 + vstelm.w vr23, a6, 12, 0 +endfunc_x264 +#endif /* !HIGH_BIT_DEPTH */ diff --git a/common/pixel.c b/common/pixel.c index c7b39c8b..34972c69 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -45,7 +45,9 @@ #if HAVE_MSA # include "mips/pixel.h" #endif - +#if HAVE_LSX +# include "loongarch/pixel.h" +#endif /**************************************************************************** * pixel_sad_WxH @@ -1531,6 +1533,26 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) } 
 #endif // HAVE_MSA
+#if HAVE_LSX
+    if( cpu&X264_CPU_LSX )
+    {
+        INIT8( sad, _lsx );
+        INIT8_NAME( sad_aligned, sad, _lsx );
+        INIT7( sad_x3, _lsx );
+        INIT7( sad_x4, _lsx );
+    }
+
+    if( cpu&X264_CPU_LASX )
+    {
+        pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_lasx;
+        pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_lasx;
+        pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_lasx;
+        pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_lasx;
+        pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_lasx;
+        pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_lasx;
+    }
+#endif /* HAVE_LSX */
+
 #endif // HIGH_BIT_DEPTH
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
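
Note on the contract these kernels implement: each sad_x3/sad_x4 routine fills p_sad_array with one SAD per candidate reference, comparing the same source block against three (or four) reference blocks that share a single stride. The source rows are read 16 bytes apart (fld.d/vld at offsets 0, 16, 32, 48), i.e. the fixed encode-buffer row pitch, so only the reference stride is passed in. The scalar sketch below is only a model of that contract, not code from this patch; sad_x3_ref, width and height are illustrative names, since each assembly function hard-codes its block size (8x4, 8x8, 16x16, ...).

#include <stdint.h>

/* Scalar model of the sad_x3 semantics implemented by the LSX kernels above.
 * Source rows are 16 bytes apart (matching the fixed offsets the kernels load
 * from); the three reference blocks share i_ref_stride. */
static void sad_x3_ref( const uint8_t *p_src,
                        const uint8_t *p_ref0, const uint8_t *p_ref1,
                        const uint8_t *p_ref2, intptr_t i_ref_stride,
                        int width, int height, int32_t p_sad_array[3] )
{
    const uint8_t *ref[3] = { p_ref0, p_ref1, p_ref2 };
    for( int k = 0; k < 3; k++ )
    {
        int32_t sum = 0;
        for( int y = 0; y < height; y++ )
            for( int x = 0; x < width; x++ )
            {
                int d = p_src[y * 16 + x] - ref[k][y * i_ref_stride + x];
                sum += d < 0 ? -d : d;   /* per-pixel absolute difference, like vabsd.bu */
            }
        p_sad_array[k] = sum;
    }
}

The x4 variant is identical apart from a fourth reference pointer and a four-element result array; scoring several candidates per call lets the kernels load each source row once and reuse it against every reference.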
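Within each kernel, the per-row work is vabsd.bu (per-byte absolute difference) followed by same-register vhaddw widening horizontal adds, with vadd.h accumulating rows in 16-bit lanes between the first and later widening steps; the chain is how a 128-bit vector of byte differences collapses into the single 32-bit total that vstelm.w writes to p_sad_array. Below is a minimal scalar model of that same-register chain, assuming the pairwise-add behaviour of vhaddw when both source operands are the same register; hadd_u8x16 is an illustrative name, not part of the patch.

#include <stdint.h>

/* Scalar model of the vhaddw.hu.bu -> .wu.hu -> .du.wu -> .qu.du chain applied
 * to one register: successive pairwise sums that widen 16 byte lanes into a
 * single total, which ends up in the low element that vstelm.w stores. */
static uint32_t hadd_u8x16( const uint8_t v[16] )
{
    uint16_t h[8];
    uint32_t w[4];
    uint64_t d[2];
    for( int i = 0; i < 8; i++ ) h[i] = v[2*i] + v[2*i+1];   /* vhaddw.hu.bu */
    for( int i = 0; i < 4; i++ ) w[i] = h[2*i] + h[2*i+1];   /* vhaddw.wu.hu */
    for( int i = 0; i < 2; i++ ) d[i] = w[2*i] + w[2*i+1];   /* vhaddw.du.wu */
    return (uint32_t)( d[0] + d[1] );                        /* vhaddw.qu.du */
}

The 16-bit accumulators have ample headroom: each halfword lane holds the sum of two byte differences per row (at most 510), and the largest block here is 16 rows, so a lane never exceeds 8160 before the 32-bit widening step.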