/*****************************************************************************
* deblock-a.S: loongarch deblock functions
*****************************************************************************
* Copyright (C) 2023-2024 x264 project
*
* Authors: Hao Chen <chenhao@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "loongson_asm.S"
#include "loongson_util.S"
#if !HIGH_BIT_DEPTH
const shuf_loc_locn
.byte 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28
.byte 16, 24, 0, 8, 17, 25, 1, 9, 18, 26, 2, 10, 19, 27, 3, 11
endconst
const shuf_locn
.byte 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
endconst
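/* Shuffle controls used by deblock_strength_{lasx,lsx} below to gather the
 * current/neighbour (loc/locn) byte pairs from the nnz and ref arrays. */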
/* Transpose a 16x6 block of byte elements held in vector registers. */
.macro LASX_TRANSPOSE in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15,\
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,\
out0, out1, out2, out3, out4, out5
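// Interleave-based transpose: pair bytes, then halfwords, words and
// doublewords; the xvpermi.d steps fix up the 128-bit lane split of
// the 256-bit LASX registers.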
xvilvl.b \tmp0, \in1, \in0
xvilvl.b \tmp1, \in3, \in2
xvilvl.b \tmp2, \in5, \in4
xvilvl.b \tmp3, \in7, \in6
xvilvl.b \tmp4, \in9, \in8
xvilvl.b \tmp5, \in11, \in10
xvilvl.b \tmp6, \in13, \in12
xvilvl.b \tmp7, \in15, \in14
xvpermi.d \tmp0, \tmp0, 0xD8
xvpermi.d \tmp1, \tmp1, 0xD8
xvpermi.d \tmp2, \tmp2, 0xD8
xvpermi.d \tmp3, \tmp3, 0xD8
xvpermi.d \tmp4, \tmp4, 0xD8
xvpermi.d \tmp5, \tmp5, 0xD8
xvpermi.d \tmp6, \tmp6, 0xD8
xvpermi.d \tmp7, \tmp7, 0xD8
xvilvl.h \out0, \tmp1, \tmp0
xvilvl.h \out1, \tmp3, \tmp2
xvilvl.h \out2, \tmp5, \tmp4
xvilvl.h \out3, \tmp7, \tmp6
xvilvl.w \tmp0, \out1, \out0
xvilvh.w \tmp1, \out1, \out0
xvilvl.w \tmp2, \out3, \out2
xvilvh.w \tmp3, \out3, \out2
xvilvl.d \out0, \tmp2, \tmp0
xvilvh.d \out1, \tmp2, \tmp0
xvilvl.d \out2, \tmp3, \tmp1
xvilvh.d \out3, \tmp3, \tmp1
xvpermi.d \out4, \out0, 0x4E
xvpermi.d \out5, \out1, 0x4E
.endm
/*
* void deblock_h_luma_lasx(pixel *pix, intptr_t stride, int alpha,
* int beta, int8_t *tc0)
*/
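/*
 * Filters a vertical 16-pixel luma edge (samples of each row are read
 * across the edge): 16 rows of 8 bytes are loaded from pix - 3,
 * transposed so that p2..q2 become vectors, filtered, and the 4
 * modified columns p1/p0/q0/q1 are stored back per row at pix - 2.
 * Core update (sketch of the H.264 normal filter):
 *   delta = clip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
 *   p0' = p0 + delta, q0' = q0 - delta
 * where tc starts at tc0 and is incremented once for each of p1/q1
 * that is additionally filtered (|p2 - p0| < beta, |q2 - q0| < beta).
 */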
function_x264 deblock_h_luma_lasx
slli.d t0, a1, 1
slli.d t2, a1, 2
xvldrepl.w xr1, a4, 0
add.d t1, t0, a1
xvreplgr2vr.b xr2, a3
xvilvl.b xr1, xr1, xr1
// Store registers to the stack
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
// Load data from pix
addi.d t4, a0, -3
FLDD_LOADX_4 t4, a1, t0, t1, f10, f11, f12, f13
add.d t5, t4, t2
FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23
add.d t6, t5, t2
FLDD_LOADX_4 t6, a1, t0, t1, f24, f25, f26, f27
LASX_TRANSPOSE xr10, xr11, xr12, xr13, xr14, xr15, xr16, xr17, \
xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27, \
xr8, xr9, xr18, xr19, xr28, xr29, xr30, xr31, \
xr10, xr11, xr12, xr13, xr14, xr15
xvilvl.h xr1, xr1, xr1
vext2xv.hu.bu xr20, xr10
vext2xv.hu.bu xr21, xr11
vext2xv.hu.bu xr22, xr12
vext2xv.hu.bu xr23, xr13
vext2xv.hu.bu xr24, xr14
vext2xv.hu.bu xr25, xr15
vext2xv.h.b xr3, xr1
xvadd.h xr26, xr22, xr23
xvsrari.h xr26, xr26, 1
xvneg.h xr4, xr3
xvadd.h xr27, xr20, xr26
xvadd.h xr28, xr25, xr26
xvsub.h xr29, xr23, xr22
xvsrai.h xr27, xr27, 1
xvsrai.h xr28, xr28, 1
xvslli.h xr29, xr29, 2
xvsub.h xr30, xr21, xr24
xvsub.h xr27, xr27, xr21
xvsub.h xr28, xr28, xr24
xvadd.h xr29, xr29, xr30
xvclip.h xr27, xr27, xr4, xr3
xvclip.h xr28, xr28, xr4, xr3
xvpickev.b xr16, xr25, xr20
xvpickev.b xr17, xr23, xr22
xvabsd.bu xr5, xr16, xr17
xvaddi.hu xr6, xr3, 1
xvslt.bu xr5, xr5, xr2
xvilvl.b xr30, xr5, xr5
xvilvh.b xr31, xr5, xr5
xvbitsel.v xr3, xr3, xr6, xr30
xvsrari.h xr29, xr29, 3
xvaddi.hu xr6, xr3, 1
xvbitsel.v xr3, xr3, xr6, xr31
xvneg.h xr4, xr3
xvclip.h xr29, xr29, xr4, xr3
xvadd.h xr30, xr21, xr27
xvadd.h xr18, xr24, xr28
xvadd.h xr19, xr22, xr29
xvsub.h xr26, xr23, xr29
xvssrarni.bu.h xr26, xr19, 0
xvpickev.b xr25, xr18, xr30
xvpickev.b xr27, xr24, xr21
xvpickev.b xr28, xr23, xr22
xvpickev.b xr18, xr22, xr21
xvabsd.bu xr19, xr18, xr17
xvreplgr2vr.b xr30, a2
xvilvl.d xr31, xr30, xr2
xvabsd.bu xr20, xr14, xr13
xvslt.bu xr19, xr19, xr31
xvslt.bu xr20, xr20, xr2
xvbitsel.v xr25, xr27, xr25, xr5
xvpermi.d xr20, xr20, 0x50
xvand.v xr21, xr20, xr19
xvpermi.d xr7, xr21, 0xB1
xvand.v xr21, xr21, xr7
xvbitsel.v xr25, xr27, xr25, xr21
xvpermi.d xr1, xr1, 0x50
xvbitsel.v xr26, xr28, xr26, xr21
xvslti.b xr30, xr1, 0
xvbitsel.v xr25, xr25, xr27, xr30
xvbitsel.v xr26, xr26, xr28, xr30
xvilvl.b xr10, xr26, xr25
xvilvh.b xr20, xr25, xr26
xvilvl.h xr21, xr20, xr10
xvilvh.h xr22, xr20, xr10
// Store data to pix
addi.d t5, a0, -2
xvstelm.w xr21, t5, 0, 0
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 1
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 2
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 3
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 0
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 1
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 2
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 3
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 4
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 5
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 6
add.d t5, t5, a1
xvstelm.w xr21, t5, 0, 7
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 4
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 5
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 6
add.d t5, t5, a1
xvstelm.w xr22, t5, 0, 7
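// Restore register values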
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
/*
* void deblock_v_luma_lasx(pixel *pix, intptr_t stride,
* int alpha, int beta, int8_t *tc0)
*/
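/*
 * Horizontal-edge counterpart of deblock_h_luma_lasx: the rows p2..q2
 * are loaded directly (no transpose needed), the same tc0-clipped
 * normal filter is applied, and the p1/p0/q0/q1 rows are stored back.
 */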
function_x264 deblock_v_luma_lasx
slli.d t0, a1, 1
// Load data from tc0
xvldrepl.w xr1, a4, 0
add.d t1, t0, a1
xvreplgr2vr.b xr2, a3
xvilvl.b xr1, xr1, xr1
// Load data from pix
sub.d t5, a0, t1
vld vr10, t5, 0
vldx vr11, t5, a1
vldx vr12, t5, t0
vld vr13, a0, 0
vldx vr14, a0, a1
vldx vr15, a0, t0
// Store registers to the stack
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
xvilvl.h xr1, xr1, xr1
vext2xv.hu.bu xr20, xr10
vext2xv.hu.bu xr21, xr11
vext2xv.hu.bu xr22, xr12
vext2xv.hu.bu xr23, xr13
vext2xv.hu.bu xr24, xr14
vext2xv.hu.bu xr25, xr15
vext2xv.h.b xr3, xr1
xvadd.h xr26, xr22, xr23
xvsrari.h xr26, xr26, 1
xvneg.h xr4, xr3
xvadd.h xr27, xr20, xr26
xvadd.h xr28, xr25, xr26
xvsub.h xr29, xr23, xr22
xvsrai.h xr27, xr27, 1
xvsrai.h xr28, xr28, 1
xvslli.h xr29, xr29, 2
xvsub.h xr30, xr21, xr24
xvsub.h xr27, xr27, xr21
xvsub.h xr28, xr28, xr24
xvadd.h xr29, xr29, xr30
xvclip.h xr27, xr27, xr4, xr3
xvclip.h xr28, xr28, xr4, xr3
xvpickev.b xr16, xr25, xr20
xvpickev.b xr17, xr23, xr22
xvabsd.bu xr5, xr16, xr17
xvaddi.hu xr6, xr3, 1
xvslt.bu xr5, xr5, xr2
xvilvl.b xr30, xr5, xr5
xvilvh.b xr31, xr5, xr5
xvbitsel.v xr3, xr3, xr6, xr30
xvsrari.h xr29, xr29, 3
xvaddi.hu xr6, xr3, 1
xvbitsel.v xr3, xr3, xr6, xr31
xvneg.h xr4, xr3
xvclip.h xr29, xr29, xr4, xr3
xvadd.h xr30, xr21, xr27
xvadd.h xr18, xr24, xr28
xvadd.h xr19, xr22, xr29
xvsub.h xr26, xr23, xr29
xvssrarni.bu.h xr26, xr19, 0
xvpickev.b xr25, xr18, xr30
xvpickev.b xr27, xr24, xr21
xvpickev.b xr28, xr23, xr22
xvpickev.b xr18, xr22, xr21
xvabsd.bu xr19, xr18, xr17
xvreplgr2vr.b xr30, a2
xvilvl.d xr31, xr30, xr2
xvabsd.bu xr20, xr14, xr13
xvslt.bu xr19, xr19, xr31
xvslt.bu xr20, xr20, xr2
xvbitsel.v xr25, xr27, xr25, xr5
xvpermi.d xr20, xr20, 0x50
xvand.v xr21, xr20, xr19
xvpermi.d xr7, xr21, 0xB1
xvand.v xr21, xr21, xr7
xvbitsel.v xr25, xr27, xr25, xr21
xvpermi.d xr1, xr1, 0x50
xvbitsel.v xr26, xr28, xr26, xr21
xvslti.b xr30, xr1, 0
xvbitsel.v xr25, xr25, xr27, xr30
xvbitsel.v xr26, xr26, xr28, xr30
sub.d t5, a0, t0
xvpermi.d xr0, xr25, 0xd8
xvpermi.d xr1, xr26, 0xd8
xvpermi.d xr2, xr26, 0x8D
xvpermi.d xr3, xr25, 0x8D
// Store data to pix
vst vr0, t5, 0
vstx vr1, t5, a1
vst vr2, a0, 0
vstx vr3, a0, a1
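// Restore register values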
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
/*
* void deblock_v_luma_intra_lasx(pixel *pix, intptr_t stride,
* int alpha, int beta)
*/
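/*
 * Intra (strong) filter for a horizontal edge; no tc0 is involved.
 * Where |p0 - q0| < (alpha >> 2) + 2 and the beta checks on p2/q2
 * pass, the strong smoothing is used, e.g. (H.264 sketch):
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 * otherwise p0/q0 fall back to the weak form
 *   p0' = (2*p1 + p0 + q1 + 2) >> 2
 */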
function_x264 deblock_v_luma_intra_lasx
slli.d t0, a1, 1
slli.d t2, a1, 2
add.d t1, t0, a1
// Load data from pix
sub.d t5, a0, t2
vld vr9, t5, 0
vldx vr10, t5, a1
vldx vr11, t5, t0
vldx vr12, t5, t1
vld vr13, a0, 0
vldx vr14, a0, a1
vldx vr15, a0, t0
vldx vr16, a0, t1
// Store registers to the stack
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
xvreplgr2vr.b xr1, a2
xvreplgr2vr.b xr2, a3
vext2xv.hu.bu xr19, xr9
vext2xv.hu.bu xr20, xr10
vext2xv.hu.bu xr21, xr11
vext2xv.hu.bu xr22, xr12
vext2xv.hu.bu xr23, xr13
vext2xv.hu.bu xr24, xr14
vext2xv.hu.bu xr25, xr15
vext2xv.hu.bu xr26, xr16
xvadd.h xr27, xr21, xr22
xvadd.h xr29, xr19, xr20
xvadd.h xr3, xr27, xr23
xvadd.h xr6, xr27, xr24
xvadd.h xr4, xr3, xr20
xvslli.h xr29, xr29, 1
xvadd.h xr5, xr6, xr4
xvadd.h xr6, xr6, xr21
xvadd.h xr5, xr5, xr23
xvadd.h xr7, xr29, xr4
xvsrari.h xr3, xr4, 2
xvsrari.h xr6, xr6, 2
xvsrari.h xr4, xr5, 3
xvadd.h xr27, xr24, xr23
xvadd.h xr28, xr26, xr25
xvsrari.h xr5, xr7, 3
xvadd.h xr29, xr22, xr27
xvslli.h xr28, xr28, 1
xvadd.h xr7, xr29, xr25
xvadd.h xr17, xr27, xr21
xvadd.h xr8, xr7, xr28
xvadd.h xr18, xr17, xr7
xvadd.h xr17, xr17, xr24
xvadd.h xr18, xr18, xr22
xvsrari.h xr7, xr7, 2
xvsrari.h xr8, xr8, 3
xvsrari.h xr18, xr18, 3
xvsrari.h xr17, xr17, 2
xvpickev.b xr27, xr25, xr20
xvpickev.b xr28, xr24, xr21
xvpickev.b xr29, xr23, xr22
xvpickev.b xr9, xr8, xr5
xvpickev.b xr16, xr7, xr3
xvabsd.bu xr30, xr27, xr29
xvpickev.b xr19, xr18, xr4
xvpickev.b xr26, xr17, xr6
xvslt.bu xr31, xr30, xr2
xvabsd.bu xr20, xr12, xr13
xvabsd.bu xr21, xr11, xr12
xvabsd.bu xr22, xr14, xr13
xvsrli.b xr0, xr1, 2
xvbitsel.v xr19, xr26, xr19, xr31
xvbitsel.v xr9, xr27, xr9, xr31
xvbitsel.v xr16, xr28, xr16, xr31
xvaddi.bu xr0, xr0, 2
xvpermi.d xr20, xr20, 0x50
xvpermi.d xr21, xr21, 0x50
xvpermi.d xr22, xr22, 0x50
xvslt.bu xr10, xr20, xr0
xvslt.bu xr11, xr20, xr1
xvslt.bu xr12, xr21, xr2
xvslt.bu xr13, xr22, xr2
xvand.v xr30, xr11, xr12
xvand.v xr30, xr30, xr13
xvbitsel.v xr9, xr27, xr9, xr10
xvbitsel.v xr16, xr28, xr16, xr10
xvbitsel.v xr19, xr26, xr19, xr10
xvbitsel.v xr9, xr27, xr9, xr30
xvbitsel.v xr16, xr28, xr16, xr30
xvbitsel.v xr19, xr29, xr19, xr30
xvpermi.d xr1, xr9, 0xD8
xvpermi.d xr2, xr16, 0xD8
xvpermi.d xr3, xr19, 0xD8
xvpermi.d xr4, xr19, 0x8D
xvpermi.d xr5, xr16, 0x8D
xvpermi.d xr6, xr9, 0x8D
// Store data to pix
vstx vr1, t5, a1
vstx vr2, t5, t0
vstx vr3, t5, t1
vst vr4, a0, 0
vstx vr5, a0, a1
vstx vr6, a0, t0
// Restore register values
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
/*
* void deblock_h_luma_intra_lasx(pixel *pix, intptr_t stride,
* int alpha, int beta)
*/
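/*
 * Vertical-edge version of the intra filter: 16 rows of 8 bytes are
 * loaded from pix - 4, transposed to expose p3..q3 as vectors,
 * filtered as in deblock_v_luma_intra_lasx, and the 6 modified
 * columns p2..q2 are stored back per row at pix - 3.
 */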
function_x264 deblock_h_luma_intra_lasx
slli.d t0, a1, 1
slli.d t2, a1, 2
addi.d t5, a0, -4
add.d t1, t0, a1
// Store registers to the stack
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
// Load data from pix
FLDD_LOADX_4 t5, a1, t0, t1, f10, f11, f12, f13
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f24, f25, f26, f27
LASX_TRANSPOSE16X8_B xr10, xr11, xr12, xr13, xr14, xr15, xr16, xr17, \
xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27, \
xr9, xr10, xr11, xr12, xr13, xr14, xr15, xr16, \
xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7
xvreplgr2vr.b xr1, a2
xvreplgr2vr.b xr2, a3
vext2xv.hu.bu xr19, xr9
vext2xv.hu.bu xr20, xr10
vext2xv.hu.bu xr21, xr11
vext2xv.hu.bu xr22, xr12
vext2xv.hu.bu xr23, xr13
vext2xv.hu.bu xr24, xr14
vext2xv.hu.bu xr25, xr15
vext2xv.hu.bu xr26, xr16
xvadd.h xr27, xr21, xr22
xvadd.h xr29, xr19, xr20
xvadd.h xr3, xr27, xr23
xvadd.h xr6, xr27, xr24
xvadd.h xr4, xr3, xr20
xvslli.h xr29, xr29, 1
xvadd.h xr5, xr6, xr4
xvadd.h xr6, xr6, xr21
xvadd.h xr5, xr5, xr23
xvadd.h xr7, xr29, xr4
xvsrari.h xr3, xr4, 2
xvsrari.h xr6, xr6, 2
xvsrari.h xr4, xr5, 3
xvadd.h xr27, xr24, xr23
xvadd.h xr28, xr26, xr25
xvsrari.h xr5, xr7, 3
xvadd.h xr29, xr22, xr27
xvslli.h xr28, xr28, 1
xvadd.h xr7, xr29, xr25
xvadd.h xr17, xr27, xr21
xvadd.h xr8, xr7, xr28
xvadd.h xr18, xr17, xr7
xvadd.h xr17, xr17, xr24
xvadd.h xr18, xr18, xr22
xvsrari.h xr7, xr7, 2
xvsrari.h xr8, xr8, 3
xvsrari.h xr18, xr18, 3
xvsrari.h xr17, xr17, 2
xvpickev.b xr27, xr25, xr20
xvpickev.b xr28, xr24, xr21
xvpickev.b xr29, xr23, xr22
xvpickev.b xr9, xr8, xr5
xvpickev.b xr16, xr7, xr3
xvabsd.bu xr30, xr27, xr29
xvpickev.b xr19, xr18, xr4
xvpickev.b xr26, xr17, xr6
xvslt.bu xr31, xr30, xr2
xvabsd.bu xr20, xr12, xr13
xvabsd.bu xr21, xr11, xr12
xvabsd.bu xr22, xr14, xr13
xvsrli.b xr0, xr1, 2
xvbitsel.v xr19, xr26, xr19, xr31
xvbitsel.v xr9, xr27, xr9, xr31
xvbitsel.v xr16, xr28, xr16, xr31
xvaddi.bu xr0, xr0, 2
xvpermi.d xr20, xr20, 0x50
xvpermi.d xr21, xr21, 0x50
xvpermi.d xr22, xr22, 0x50
xvslt.bu xr10, xr20, xr0
xvslt.bu xr11, xr20, xr1
xvslt.bu xr12, xr21, xr2
xvslt.bu xr13, xr22, xr2
xvand.v xr30, xr11, xr12
xvand.v xr30, xr30, xr13
xvbitsel.v xr9, xr27, xr9, xr10
xvbitsel.v xr16, xr28, xr16, xr10
xvbitsel.v xr19, xr26, xr19, xr10
xvbitsel.v xr9, xr27, xr9, xr30
xvbitsel.v xr16, xr28, xr16, xr30
xvbitsel.v xr19, xr29, xr19, xr30
xvilvl.b xr0, xr16, xr9
xvpermi.d xr18, xr19, 0xB1
xvilvh.b xr1, xr9, xr16
xvilvl.b xr2, xr18, xr19
addi.d t5, a0, -3
xvilvl.h xr3, xr2, xr0
xvilvh.h xr4, xr2, xr0
// Store data to pix
xvstelm.w xr3, t5, 0, 0
xvstelm.h xr1, t5, 4, 0
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 1
xvstelm.h xr1, t5, 4, 1
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 2
xvstelm.h xr1, t5, 4, 2
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 3
xvstelm.h xr1, t5, 4, 3
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 0
xvstelm.h xr1, t5, 4, 4
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 1
xvstelm.h xr1, t5, 4, 5
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 2
xvstelm.h xr1, t5, 4, 6
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 3
xvstelm.h xr1, t5, 4, 7
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 4
xvstelm.h xr1, t5, 4, 8
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 5
xvstelm.h xr1, t5, 4, 9
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 6
xvstelm.h xr1, t5, 4, 10
add.d t5, t5, a1
xvstelm.w xr3, t5, 0, 7
xvstelm.h xr1, t5, 4, 11
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 4
xvstelm.h xr1, t5, 4, 12
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 5
xvstelm.h xr1, t5, 4, 13
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 6
xvstelm.h xr1, t5, 4, 14
add.d t5, t5, a1
xvstelm.w xr4, t5, 0, 7
xvstelm.h xr1, t5, 4, 15
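// Restore register values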
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
/*
* void deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
* int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
* int mvy_limit, int bframe )
*/
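/*
 * Boundary-strength computation (sketch of the C reference):
 *   bs = 2 if either adjacent 4x4 block has non-zero coefficients,
 *   bs = 1 if the references differ, or |mv_x delta| >= 4, or
 *          |mv_y delta| >= mvy_limit (both lists when bframe != 0),
 *   bs = 0 otherwise.
 * dir = 0 compares each block against its left neighbour and fills
 * bs[0]; dir = 1 compares against the block above and fills bs[1].
 */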
function_x264 deblock_strength_lasx
// dir = 0 s1 = 8 s2 = 1
vldi vr18, 2
vldi vr19, 1
addi.d t0, zero, 4
xvreplgr2vr.h xr20, t0
xvreplgr2vr.h xr21, a4
xvld xr0, a0, 11
xvpermi.q xr1, xr0, 0x01
la.local t0, shuf_loc_locn
xvld xr23, t0, 0
xvshuf.b xr4, xr1, xr0, xr23
xvpermi.q xr5, xr4, 0x01
vor.v vr6, vr4, vr5
vseqi.b vr6, vr6, 0
vmov vr15, vr6
vxor.v vr8, vr8, vr8
vbitsel.v vr8, vr18, vr8, vr6
xvld xr0, a1, 11
xvpermi.q xr1, xr0, 0x01
xvshuf.b xr4, xr1, xr0, xr23
xvpermi.q xr5, xr4, 0x01
vseq.b vr4, vr4, vr5
vseqi.b vr4, vr4, 0
vld vr0, a2, 44
vld vr1, a2, 76
vld vr5, a2, 108
vld vr6, a2, 140
vilvl.h vr9, vr1, vr0
vilvl.h vr10, vr6, vr5
vilvl.w vr11, vr10, vr9
vilvh.w vr12, vr10, vr9
vilvh.h vr9, vr1, vr0
vilvh.h vr10, vr6, vr5
vilvl.w vr13, vr10, vr9
vilvh.w vr14, vr10, vr9
vilvl.d vr0, vr13, vr12
ld.h t0, a2, 60
ld.h t1, a2, 92
ld.h t2, a2, 124
ld.h t3, a2, 156
vmov vr6, vr14
vinsgr2vr.h vr6, t0, 4
vinsgr2vr.h vr6, t1, 5
vinsgr2vr.h vr6, t2, 6
vinsgr2vr.h vr6, t3, 7
vilvl.d vr1, vr12, vr11
vilvl.d vr5, vr14, vr13
xvpermi.q xr0, xr6, 0x02 // mv[0][loc][0]
xvpermi.q xr5, xr1, 0x20 // mv[0][locn][0]
xvabsd.h xr5, xr0, xr5
xvsle.h xr5, xr20, xr5
vilvh.d vr0, vr13, vr12
ld.h t0, a2, 62
ld.h t1, a2, 94
ld.h t2, a2, 126
ld.h t3, a2, 158
vbsrl.v vr7, vr14, 8
vinsgr2vr.h vr7, t0, 4
vinsgr2vr.h vr7, t1, 5
vinsgr2vr.h vr7, t2, 6
vinsgr2vr.h vr7, t3, 7
vilvh.d vr1, vr12, vr11
vilvh.d vr6, vr14, vr13
xvpermi.q xr0, xr7, 0x02 // mv[0][loc][1]
xvpermi.q xr6, xr1, 0x20 // mv[0][locn][1]
xvabsd.h xr6, xr0, xr6
xvsle.h xr6, xr21, xr6
xvor.v xr5, xr5, xr6
xvpickev.b xr5, xr5, xr5
xvpermi.d xr5, xr5, 0xd8
vor.v vr17, vr4, vr5
beqz a5, .bframe_iszero_0
// bframe != 0
xvld xr0, a1, 51
xvpermi.q xr1, xr0, 0x01
xvshuf.b xr4, xr1, xr0, xr23
xvpermi.q xr5, xr4, 0x01
vseq.b vr4, vr4, vr5
vseqi.b vr4, vr4, 0
vld vr0, a2, 204
vld vr1, a2, 236
vld vr5, a2, 268
vld vr6, a2, 300
vilvl.h vr9, vr1, vr0
vilvl.h vr10, vr6, vr5
vilvl.w vr11, vr10, vr9
vilvh.w vr12, vr10, vr9
vilvh.h vr9, vr1, vr0
vilvh.h vr10, vr6, vr5
vilvl.w vr13, vr10, vr9
vilvh.w vr14, vr10, vr9
vilvl.d vr0, vr13, vr12
ld.h t0, a2, 220
ld.h t1, a2, 252
ld.h t2, a2, 284
ld.h t3, a2, 316
vmov vr6, vr14
vinsgr2vr.h vr6, t0, 4
vinsgr2vr.h vr6, t1, 5
vinsgr2vr.h vr6, t2, 6
vinsgr2vr.h vr6, t3, 7
vilvl.d vr1, vr12, vr11
vilvl.d vr5, vr14, vr13
xvpermi.q xr0, xr6, 0x02 // mv[1][loc][0]
xvpermi.q xr5, xr1, 0x20 // mv[1][locn][0]
xvabsd.h xr5, xr0, xr5
xvsle.h xr5, xr20, xr5
vilvh.d vr0, vr13, vr12
ld.h t0, a2, 222
ld.h t1, a2, 254
ld.h t2, a2, 286
ld.h t3, a2, 318
vbsrl.v vr7, vr14, 8
vinsgr2vr.h vr7, t0, 4
vinsgr2vr.h vr7, t1, 5
vinsgr2vr.h vr7, t2, 6
vinsgr2vr.h vr7, t3, 7
vilvh.d vr1, vr12, vr11
vilvh.d vr6, vr14, vr13
xvpermi.q xr0, xr7, 0x02 // mv[1][loc][1]
xvpermi.q xr6, xr1, 0x20 // mv[1][locn][1]
xvabsd.h xr6, xr0, xr6
xvsle.h xr6, xr21, xr6
xvor.v xr5, xr5, xr6
xvpickev.b xr5, xr5, xr5
xvpermi.d xr5, xr5, 0xd8
vor.v vr5, vr5, vr4
vor.v vr17, vr5, vr17
.bframe_iszero_0:
vxor.v vr22, vr22, vr22
vbitsel.v vr22, vr22, vr19, vr17
vbitsel.v vr22, vr8, vr22, vr15
vst vr22, a3, 0
// dir = 1 s1 = 1 s2 = 8
vld vr0, a0, 4
vld vr1, a0, 20
ld.wu t0, a0, 36
vpickev.w vr2, vr1, vr0
vbsrl.v vr3, vr2, 4
vinsgr2vr.w vr3, t0, 3
vor.v vr2, vr3, vr2
vseqi.b vr2, vr2, 0
vmov vr15, vr2
vxor.v vr3, vr3, vr3
vbitsel.v vr3, vr18, vr3, vr2
vld vr0, a1, 4
vld vr1, a1, 20
ld.w t0, a1, 36
vpickev.w vr2, vr1, vr0
vbsrl.v vr4, vr2, 4
vinsgr2vr.w vr4, t0, 3
vseq.b vr2, vr4, vr2
vseqi.b vr2, vr2, 0
vld vr0, a2, 16
vld vr1, a2, 48
vld vr12, a2, 80
vld vr13, a2, 112
vld vr4, a2, 144
vpickev.h vr5, vr1, vr0
vpickev.h vr14, vr13, vr12
xvpermi.q xr5, xr14, 0x02 // mv[0][locn][0]
vpickev.h vr7, vr4, vr4
xvpermi.d xr6, xr5, 0x39
xvinsve0.d xr6, xr7, 3 // mv[0][loc][0]
xvabsd.h xr5, xr6, xr5
xvsle.h xr5, xr20, xr5
vpickod.h vr6, vr1, vr0
vpickod.h vr14, vr13, vr12
xvpermi.q xr6, xr14, 0x02 // mv[0][locn][1]
vpickod.h vr7, vr4, vr4
xvpermi.d xr8, xr6, 0x39
xvinsve0.d xr8, xr7, 3 // mv[0][loc][1]
xvabsd.h xr6, xr8, xr6
xvsle.h xr6, xr21, xr6
xvor.v xr5, xr6, xr5
xvpickev.b xr6, xr5, xr5
xvpermi.d xr6, xr6, 0xd8
vor.v vr2, vr6, vr2
beqz a5, .bframe_iszero_1
// bframe != 0 ref[1]
vld vr0, a1, 44
vld vr1, a1, 60
ld.w t0, a1, 76
vpickev.w vr0, vr1, vr0
vbsrl.v vr1, vr0, 4
vinsgr2vr.w vr1, t0, 3
vseq.b vr11, vr1, vr0
vseqi.b vr11, vr11, 0
vld vr0, a2, 176
vld vr1, a2, 208
vld vr12, a2, 240
vld vr13, a2, 272
vld vr4, a2, 304
vpickev.h vr5, vr1, vr0
vpickev.h vr14, vr13, vr12
xvpermi.q xr5, xr14, 0x02 // mv[1][locn][0]
vpickev.h vr7, vr4, vr4
xvpermi.d xr6, xr5, 0x39
xvinsve0.d xr6, xr7, 3 // mv[1][loc][0]
xvabsd.h xr5, xr6, xr5
xvsle.h xr5, xr20, xr5
vpickod.h vr6, vr1, vr0
vpickod.h vr14, vr13, vr12
xvpermi.q xr6, xr14, 0x02 // mv[1][locn][1]
vpickod.h vr7, vr4, vr4
xvpermi.d xr8, xr6, 0x39
xvinsve0.d xr8, xr7, 3 // mv[1][loc][1]
xvabsd.h xr6, xr8, xr6
xvsle.h xr6, xr21, xr6
xvor.v xr5, xr6, xr5
xvpickev.b xr6, xr5, xr5
xvpermi.d xr6, xr6, 0xd8
vor.v vr6, vr6, vr11
vor.v vr2, vr6, vr2
.bframe_iszero_1:
vxor.v vr22, vr22, vr22
vbitsel.v vr22, vr22, vr19, vr2
vbitsel.v vr22, vr3, vr22, vr15
vst vr22, a3, 32
endfunc_x264
/*
* void deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
* int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
* int mvy_limit, int bframe )
*/
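// 128-bit LSX variant of deblock_strength_lasx; same bs rules as above.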
function_x264 deblock_strength_lsx
// dir = 0 s1 = 8 s2 = 1
vldi vr18, 2
vldi vr19, 1
addi.d t0, zero, 4
vreplgr2vr.h vr20, t0
vreplgr2vr.h vr21, a4
vld vr0, a0, 11
vld vr1, a0, 27
la.local t0, shuf_loc_locn
la.local t1, shuf_locn
vld vr2, t0, 0
vld vr3, t1, 0
vshuf.b vr4, vr1, vr0, vr2
vshuf.b vr5, vr1, vr0, vr3
vor.v vr6, vr4, vr5
vseqi.b vr6, vr6, 0
vmov vr15, vr6
vxor.v vr8, vr8, vr8
vbitsel.v vr8, vr18, vr8, vr6
vld vr0, a1, 11
vld vr1, a1, 27
vshuf.b vr4, vr1, vr0, vr2
vshuf.b vr5, vr1, vr0, vr3
vseq.b vr4, vr4, vr5
vseqi.b vr4, vr4, 0
vld vr0, a2, 44
vld vr1, a2, 76
vld vr5, a2, 108
vld vr6, a2, 140
vilvl.h vr9, vr1, vr0
vilvl.h vr10, vr6, vr5
vilvl.w vr11, vr10, vr9
vilvh.w vr12, vr10, vr9
vilvh.h vr9, vr1, vr0
vilvh.h vr10, vr6, vr5
vilvl.w vr13, vr10, vr9
vilvh.w vr14, vr10, vr9
vilvl.d vr0, vr13, vr12
ld.h t0, a2, 60
ld.h t1, a2, 92
ld.h t2, a2, 124
ld.h t3, a2, 156
vmov vr6, vr14
vinsgr2vr.h vr6, t0, 4
vinsgr2vr.h vr6, t1, 5
vinsgr2vr.h vr6, t2, 6
vinsgr2vr.h vr6, t3, 7
vilvl.d vr1, vr12, vr11
vilvl.d vr5, vr14, vr13
vabsd.h vr9, vr0, vr1
vabsd.h vr5, vr6, vr5
vsle.h vr9, vr20, vr9
vsle.h vr5, vr20, vr5
vilvh.d vr0, vr13, vr12
ld.h t0, a2, 62
ld.h t1, a2, 94
ld.h t2, a2, 126
ld.h t3, a2, 158
vbsrl.v vr7, vr14, 8
vinsgr2vr.h vr7, t0, 4
vinsgr2vr.h vr7, t1, 5
vinsgr2vr.h vr7, t2, 6
vinsgr2vr.h vr7, t3, 7
vilvh.d vr1, vr12, vr11
vilvh.d vr6, vr14, vr13
vabsd.h vr0, vr0, vr1
vabsd.h vr6, vr7, vr6
vsle.h vr0, vr21, vr0
vsle.h vr6, vr21, vr6
vor.v vr9, vr9, vr0
vor.v vr5, vr5, vr6
vpickev.b vr5, vr5, vr9
vor.v vr17, vr4, vr5
beqz a5, .bframeiszero_0_lsx
// bframe != 0
vld vr0, a1, 51
vld vr1, a1, 67
vshuf.b vr4, vr1, vr0, vr2
vshuf.b vr5, vr1, vr0, vr3
vseq.b vr4, vr4, vr5
vseqi.b vr4, vr4, 0
vld vr0, a2, 204
vld vr1, a2, 236
vld vr5, a2, 268
vld vr6, a2, 300
vilvl.h vr9, vr1, vr0
vilvl.h vr10, vr6, vr5
vilvl.w vr11, vr10, vr9
vilvh.w vr12, vr10, vr9
vilvh.h vr9, vr1, vr0
vilvh.h vr10, vr6, vr5
vilvl.w vr13, vr10, vr9
vilvh.w vr14, vr10, vr9
vilvl.d vr0, vr13, vr12
ld.h t0, a2, 220
ld.h t1, a2, 252
ld.h t2, a2, 284
ld.h t3, a2, 316
vmov vr6, vr14
vinsgr2vr.h vr6, t0, 4
vinsgr2vr.h vr6, t1, 5
vinsgr2vr.h vr6, t2, 6
vinsgr2vr.h vr6, t3, 7
vilvl.d vr1, vr12, vr11
vilvl.d vr5, vr14, vr13
vabsd.h vr9, vr0, vr1
vabsd.h vr5, vr6, vr5
vsle.h vr9, vr20, vr9
vsle.h vr5, vr20, vr5
vilvh.d vr0, vr13, vr12
ld.h t0, a2, 222
ld.h t1, a2, 254
ld.h t2, a2, 286
ld.h t3, a2, 318
vbsrl.v vr7, vr14, 8
vinsgr2vr.h vr7, t0, 4
vinsgr2vr.h vr7, t1, 5
vinsgr2vr.h vr7, t2, 6
vinsgr2vr.h vr7, t3, 7
vilvh.d vr1, vr12, vr11
vilvh.d vr6, vr14, vr13
vabsd.h vr0, vr0, vr1
vabsd.h vr6, vr7, vr6
vsle.h vr0, vr21, vr0
vsle.h vr6, vr21, vr6
vor.v vr9, vr9, vr0
vor.v vr5, vr5, vr6
vpickev.b vr5, vr5, vr9
vor.v vr5, vr5, vr4
vor.v vr17, vr5, vr17
.bframeiszero_0_lsx:
vxor.v vr22, vr22, vr22
vbitsel.v vr22, vr22, vr19, vr17
vbitsel.v vr22, vr8, vr22, vr15
vst vr22, a3, 0
// dir = 1 s1 = 1 s2 = 8
vld vr0, a0, 4
vld vr1, a0, 20
ld.wu t0, a0, 36
vpickev.w vr2, vr1, vr0
vbsrl.v vr3, vr2, 4
vinsgr2vr.w vr3, t0, 3
vor.v vr2, vr3, vr2
vseqi.b vr2, vr2, 0
vmov vr15, vr2
vxor.v vr3, vr3, vr3
vbitsel.v vr3, vr18, vr3, vr2
vld vr0, a1, 4
vld vr1, a1, 20
ld.w t0, a1, 36
vpickev.w vr2, vr1, vr0
vbsrl.v vr4, vr2, 4
vinsgr2vr.w vr4, t0, 3
vseq.b vr2, vr4, vr2
vseqi.b vr2, vr2, 0
vld vr0, a2, 16
vld vr1, a2, 48
vld vr12, a2, 80
vld vr13, a2, 112
vld vr4, a2, 144
vpickev.h vr5, vr1, vr0
vpickev.h vr14, vr13, vr12
vpickev.h vr7, vr4, vr4
vbsrl.v vr6, vr5, 8
vilvl.d vr6, vr14, vr6
vilvh.d vr9, vr7, vr14
vabsd.h vr5, vr6, vr5
vabsd.h vr9, vr9, vr14
vsle.h vr5, vr20, vr5
vsle.h vr9, vr20, vr9
vpickod.h vr6, vr1, vr0
vpickod.h vr14, vr13, vr12
vpickod.h vr7, vr4, vr4
vbsrl.v vr8, vr6, 8
vilvl.d vr8, vr14, vr8
vilvh.d vr7, vr7, vr14
vabsd.h vr8, vr8, vr6
vabsd.h vr7, vr7, vr14
vsle.h vr8, vr21, vr8
vsle.h vr6, vr21, vr7
vor.v vr5, vr5, vr8
vor.v vr6, vr9, vr6
vpickev.b vr6, vr6, vr5
vor.v vr2, vr6, vr2
beqz a5, .bframeiszero_1_lsx
// bframe != 0 ref[1]
vld vr0, a1, 44
vld vr1, a1, 60
ld.w t0, a1, 76
vpickev.w vr0, vr1, vr0
vbsrl.v vr1, vr0, 4
vinsgr2vr.w vr1, t0, 3
vseq.b vr11, vr1, vr0
vseqi.b vr11, vr11, 0
vld vr0, a2, 176
vld vr1, a2, 208
vld vr12, a2, 240
vld vr13, a2, 272
vld vr4, a2, 304
vpickev.h vr5, vr1, vr0
vpickev.h vr14, vr13, vr12
vpickev.h vr7, vr4, vr4
vbsrl.v vr6, vr5, 8
vilvl.d vr6, vr14, vr6
vilvh.d vr9, vr7, vr14
vabsd.h vr5, vr6, vr5
vabsd.h vr9, vr9, vr14
vsle.h vr5, vr20, vr5
vsle.h vr9, vr20, vr9
vpickod.h vr6, vr1, vr0
vpickod.h vr14, vr13, vr12
vpickod.h vr7, vr4, vr4
vbsrl.v vr8, vr6, 8
vilvl.d vr8, vr14, vr8
vilvh.d vr7, vr7, vr14
vabsd.h vr8, vr8, vr6
vabsd.h vr6, vr7, vr14
vsle.h vr8, vr21, vr8
vsle.h vr6, vr21, vr6
vor.v vr5, vr5, vr8
vor.v vr7, vr9, vr6
vpickev.b vr6, vr7, vr5
vor.v vr6, vr6, vr11
vor.v vr2, vr6, vr2
.bframeiszero_1_lsx:
vxor.v vr22, vr22, vr22
vbitsel.v vr22, vr22, vr19, vr2
vbitsel.v vr22, vr3, vr22, vr15
vst vr22, a3, 32
endfunc_x264
/*
* void deblock_v_luma_intra_lsx( pixel *pix, intptr_t stride, int alpha, int beta )
*/
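// LSX variant of deblock_v_luma_intra_lasx: the same strong/weak filter
// selection, with the pixels widened in low/high halves via
// vsllwil.hu.bu / vexth.hu.bu.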
function_x264 deblock_v_luma_intra_lsx
slli.d t0, a1, 1
add.d t1, t0, a1
slli.d t2, a1, 2
// Store registers to the stack
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
// Load data from pix
sub.d t3, a0, t2 // t3 = a0 - 4 * stride
vld vr3, t3, 0 // p3
vldx vr2, t3, a1 // p2
vldx vr1, t3, t0 // p1
vldx vr0, t3, t1 // p0
vld vr10, a0, 0 // q0
vldx vr11, a0, a1 // q1
vldx vr12, a0, t0 // q2
vldx vr13, a0, t1 // q3
vsllwil.hu.bu vr7, vr3, 0
vsllwil.hu.bu vr6, vr2, 0
vsllwil.hu.bu vr5, vr1, 0
vsllwil.hu.bu vr4, vr0, 0
vsllwil.hu.bu vr14, vr10, 0
vsllwil.hu.bu vr15, vr11, 0
vsllwil.hu.bu vr16, vr12, 0
vsllwil.hu.bu vr17, vr13, 0
/* p0', p1', p2' */
vadd.h vr8, vr5, vr4
vadd.h vr9, vr8, vr14
vadd.h vr19, vr7, vr6
vadd.h vr18, vr6, vr9 // pix[-2*xstride]
vslli.h vr19, vr19, 1
vadd.h vr20, vr9, vr18
vadd.h vr19, vr19, vr18 // pix[-3*xstride]
vadd.h vr20, vr20, vr15 // pix[-1*xstride]
/* p0' */
vadd.h vr8, vr8, vr15
vadd.h vr21, vr8, vr5 // pix[-1*xstride]
/* q0', q1', q2' */
vadd.h vr8, vr15, vr14
vadd.h vr9, vr8, vr4
vadd.h vr23, vr17, vr16
vadd.h vr22, vr9, vr16 // pix[1*xstride]
vslli.h vr23, vr23, 1
vadd.h vr24, vr9, vr22
vadd.h vr23, vr23, vr22 // pix[2*xstride]
vadd.h vr24, vr24, vr5 // pix[0*xstride]
/* q0' */
vadd.h vr8, vr8, vr5
vadd.h vr25, vr8, vr15 // pix[0*xstride]
vexth.hu.bu vr7, vr3
vexth.hu.bu vr6, vr2
vexth.hu.bu vr5, vr1
vexth.hu.bu vr4, vr0
vexth.hu.bu vr14, vr10
vexth.hu.bu vr15, vr11
vexth.hu.bu vr16, vr12
vexth.hu.bu vr17, vr13
/* p0', p1', p2' */
vadd.h vr8, vr5, vr4
vadd.h vr9, vr8, vr14
vadd.h vr27, vr6, vr9 // pix[-2*xstride]
vadd.h vr28, vr7, vr6
vslli.h vr28, vr28, 1
vadd.h vr29, vr9, vr27
vadd.h vr28, vr28, vr27 // pix[-3*xstride]
vadd.h vr29, vr29, vr15 // pix[-1*xstride]
/* p0' */
vadd.h vr8, vr8, vr15
vadd.h vr30, vr8, vr5 // pix[-1*xstride]
/* q0', q1', q2' */
vadd.h vr8, vr15, vr14
vadd.h vr9, vr8, vr4
vadd.h vr3, vr17, vr16
vadd.h vr31, vr9, vr16 // pix[1*xstride]
vslli.h vr3, vr3, 1
vadd.h vr13, vr9, vr31
vadd.h vr3, vr3, vr31 // pix[2*xstride]
vadd.h vr13, vr13, vr5 // pix[0*xstride]
/* q0' */
vadd.h vr8, vr8, vr5
vadd.h vr9, vr8, vr15 // pix[0*xstride]
vsrarni.b.h vr28, vr19, 3 // pix[-3*xstride]
vsrarni.b.h vr27, vr18, 2 // pix[-2*xstride]
vsrarni.b.h vr29, vr20, 3 // pix[-1*xstride]
vsrarni.b.h vr30, vr21, 2 // pix[-1*xstride] p0'
vsrarni.b.h vr13, vr24, 3 // pix[ 0*xstride]
vsrarni.b.h vr31, vr22, 2 // pix[ 1*xstride]
vsrarni.b.h vr3, vr23, 3 // pix[ 2*xstride]
vsrarni.b.h vr9, vr25, 2 // pix[ 0*xstride] q0'
vreplgr2vr.b vr18, a2 // alpha
vreplgr2vr.b vr19, a3 // beta
vabsd.bu vr26, vr0, vr10
vabsd.bu vr8, vr1, vr0
vabsd.bu vr16, vr11, vr10
vslt.bu vr20, vr26, vr18
vslt.bu vr21, vr8, vr19
vslt.bu vr22, vr16, vr19
vand.v vr20, vr20, vr21
vand.v vr20, vr20, vr22 // if_1
vsrli.b vr18, vr18, 2
vaddi.bu vr18, vr18, 2
vslt.bu vr26, vr26, vr18 // if_2
vabsd.bu vr23, vr2, vr0
vslt.bu vr23, vr23, vr19 // if_3
vand.v vr16, vr23, vr26 // if_2 && if_3
vnor.v vr24, vr16, vr16 // !(if_2 && if_3)
vand.v vr24, vr24, vr20 // if_1 && !(if_2 && if_3)
vand.v vr16, vr16, vr20 // if_1 && if_2 && if_3
vbitsel.v vr4, vr2, vr28, vr16 // pix[-3*xstride]
vbitsel.v vr5, vr1, vr27, vr16 // pix[-2*xstride]
vbitsel.v vr6, vr0, vr30, vr24
vbitsel.v vr6, vr6, vr29, vr16 // pix[-1*xstride]
vabsd.bu vr7, vr12, vr10
vslt.bu vr7, vr7, vr19 // if_4
vand.v vr17, vr7, vr26 // if_2 && if_4
vnor.v vr14, vr17, vr17 // !(if_2 && if_4)
vand.v vr14, vr14, vr20 // if_1 && !(if_2 && if_4)
vand.v vr17, vr17, vr20 // if_1 && if_2 && if_4
vbitsel.v vr15, vr10, vr9, vr14
vbitsel.v vr15, vr15, vr13, vr17 // pix[ 0*xstride]
vbitsel.v vr9, vr11, vr31, vr17 // pix[ 1*xstride]
vbitsel.v vr13, vr12, vr3, vr17 // pix[ 2*xstride]
vstx vr4, t3, a1
vstx vr5, t3, t0
vstx vr6, t3, t1
vst vr15, a0, 0
vstx vr9, a0, a1
vstx vr13, a0, t0
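// Restore register values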
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
/*
* void deblock_h_luma_intra_lsx( pixel *pix, intptr_t stride, int alpha, int beta )
*/
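// LSX vertical-edge intra filter: 16 rows of 8 bytes from pix - 4 are
// transposed to columns p3..q3, filtered as in deblock_v_luma_intra_lsx,
// and 6 bytes per row are written back at pix - 3.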
function_x264 deblock_h_luma_intra_lsx
slli.d t0, a1, 1
slli.d t2, a1, 2
addi.d t5, a0, -4
add.d t1, t0, a1
// Store registers to the stack
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
// Load data from pix
FLDD_LOADX_4 t5, a1, t0, t1, f10, f11, f12, f13
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23
add.d t5, t5, t2
FLDD_LOADX_4 t5, a1, t0, t1, f24, f25, f26, f27
vilvl.b vr11, vr11, vr10
vilvl.b vr13, vr13, vr12
vilvl.b vr15, vr15, vr14
vilvl.b vr17, vr17, vr16
vilvl.h vr0, vr13, vr11
vilvl.h vr1, vr17, vr15
vilvh.h vr2, vr13, vr11
vilvh.h vr3, vr17, vr15
vilvl.w vr4, vr1, vr0
vilvl.w vr6, vr3, vr2
vilvh.w vr5, vr1, vr0
vilvh.w vr7, vr3, vr2
vilvl.b vr11, vr21, vr20
vilvl.b vr13, vr23, vr22
vilvl.b vr15, vr25, vr24
vilvl.b vr17, vr27, vr26
vilvl.h vr0, vr13, vr11
vilvl.h vr1, vr17, vr15
vilvh.h vr2, vr13, vr11
vilvh.h vr3, vr17, vr15
vilvl.w vr24, vr1, vr0
vilvl.w vr26, vr3, vr2
vilvh.w vr25, vr1, vr0
vilvh.w vr27, vr3, vr2
vilvl.d vr3, vr24, vr4 // p3
vilvh.d vr2, vr24, vr4 // p2
vilvl.d vr1, vr25, vr5 // p1
vilvh.d vr0, vr25, vr5 // p0
vilvl.d vr10, vr26, vr6 // q0
vilvh.d vr11, vr26, vr6 // q1
vilvl.d vr12, vr27, vr7 // q2
vilvh.d vr13, vr27, vr7 // q3
vsllwil.hu.bu vr7, vr3, 0
vsllwil.hu.bu vr6, vr2, 0
vsllwil.hu.bu vr5, vr1, 0
vsllwil.hu.bu vr4, vr0, 0
vsllwil.hu.bu vr14, vr10, 0
vsllwil.hu.bu vr15, vr11, 0
vsllwil.hu.bu vr16, vr12, 0
vsllwil.hu.bu vr17, vr13, 0
/* p0', p1', p2' */
vadd.h vr8, vr5, vr4
vadd.h vr9, vr8, vr14
vadd.h vr19, vr7, vr6
vadd.h vr18, vr6, vr9 // pix[-2*xstride]
vslli.h vr19, vr19, 1
vadd.h vr20, vr9, vr18
vadd.h vr19, vr19, vr18 // pix[-3*xstride]
vadd.h vr20, vr20, vr15 // pix[-1*xstride]
/* p0' */
vadd.h vr8, vr8, vr15
vadd.h vr21, vr8, vr5 // pix[-1*xstride]
/* q0', q1', q2' */
vadd.h vr8, vr15, vr14
vadd.h vr9, vr8, vr4
vadd.h vr23, vr17, vr16
vadd.h vr22, vr9, vr16 // pix[1*xstride]
vslli.h vr23, vr23, 1
vadd.h vr24, vr9, vr22
vadd.h vr23, vr23, vr22 // pix[2*xstride]
vadd.h vr24, vr24, vr5 // pix[0*xstride]
/* q0' */
vadd.h vr8, vr8, vr5
vadd.h vr25, vr8, vr15 // pix[0*xstride]
vexth.hu.bu vr7, vr3
vexth.hu.bu vr6, vr2
vexth.hu.bu vr5, vr1
vexth.hu.bu vr4, vr0
vexth.hu.bu vr14, vr10
vexth.hu.bu vr15, vr11
vexth.hu.bu vr16, vr12
vexth.hu.bu vr17, vr13
/* p0', p1', p2' */
vadd.h vr8, vr5, vr4
vadd.h vr9, vr8, vr14
vadd.h vr27, vr6, vr9 // pix[-2*xstride]
vadd.h vr28, vr7, vr6
vslli.h vr28, vr28, 1
vadd.h vr29, vr9, vr27
vadd.h vr28, vr28, vr27 // pix[-3*xstride]
vadd.h vr29, vr29, vr15 // pix[-1*xstride]
/* p0' */
vadd.h vr8, vr8, vr15
vadd.h vr30, vr8, vr5 // pix[-1*xstride]
/* q0', q1', q2' */
vadd.h vr8, vr15, vr14
vadd.h vr9, vr8, vr4
vadd.h vr3, vr17, vr16
vadd.h vr31, vr9, vr16 // pix[1*xstride]
vslli.h vr3, vr3, 1
vadd.h vr13, vr9, vr31
vadd.h vr3, vr3, vr31 // pix[2*xstride]
vadd.h vr13, vr13, vr5 // pix[0*xstride]
/* q0' */
vadd.h vr8, vr8, vr5
vadd.h vr9, vr8, vr15 // pix[0*xstride]
vsrarni.b.h vr28, vr19, 3 // pix[-3*xstride]
vsrarni.b.h vr27, vr18, 2 // pix[-2*xstride]
vsrarni.b.h vr29, vr20, 3 // pix[-1*xstride]
vsrarni.b.h vr30, vr21, 2 // pix[-1*xstride] p0'
vsrarni.b.h vr13, vr24, 3 // pix[ 0*xstride]
vsrarni.b.h vr31, vr22, 2 // pix[ 1*xstride]
vsrarni.b.h vr3, vr23, 3 // pix[ 2*xstride]
vsrarni.b.h vr9, vr25, 2 // pix[ 0*xstride] q0'
vreplgr2vr.b vr18, a2 // alpha
vreplgr2vr.b vr19, a3 // beta
vabsd.bu vr26, vr0, vr10
vabsd.bu vr8, vr1, vr0
vabsd.bu vr16, vr11, vr10
vslt.bu vr20, vr26, vr18
vslt.bu vr21, vr8, vr19
vslt.bu vr22, vr16, vr19
vand.v vr20, vr20, vr21
vand.v vr20, vr20, vr22 // if_1
vsrli.b vr18, vr18, 2
vaddi.bu vr18, vr18, 2
vslt.bu vr26, vr26, vr18 // if_2
vabsd.bu vr23, vr2, vr0
vslt.bu vr23, vr23, vr19 // if_3
vand.v vr16, vr23, vr26 // if_2 && if_3
vnor.v vr24, vr16, vr16 // !(if_2 && if_3)
vand.v vr24, vr24, vr20 // if_1 && !(if_2 && if_3)
vand.v vr16, vr16, vr20 // if_1 && if_2 && if_3
vbitsel.v vr4, vr2, vr28, vr16 // pix[-3*xstride]
vbitsel.v vr5, vr1, vr27, vr16 // pix[-2*xstride]
vbitsel.v vr6, vr0, vr30, vr24
vbitsel.v vr6, vr6, vr29, vr16 // pix[-1*xstride]
vabsd.bu vr7, vr12, vr10
vslt.bu vr7, vr7, vr19 // if_4
vand.v vr17, vr7, vr26 // if_2 && if_4
vnor.v vr14, vr17, vr17 // !(if_2 && if_4)
vand.v vr14, vr14, vr20 // if_1 && !(if_2 && if_4)
vand.v vr17, vr17, vr20 // if_1 && if_2 && if_4
vbitsel.v vr15, vr10, vr9, vr14
vbitsel.v vr15, vr15, vr13, vr17 // pix[ 0*xstride]
vbitsel.v vr9, vr11, vr31, vr17 // pix[ 1*xstride]
vbitsel.v vr13, vr12, vr3, vr17 // pix[ 2*xstride]
vilvl.b vr16, vr5, vr4
vilvl.b vr17, vr15, vr6
vilvl.b vr18, vr13, vr9
vilvh.b vr19, vr5, vr4
vilvh.b vr20, vr15, vr6
vilvh.b vr21, vr13, vr9
vilvl.h vr0, vr17, vr16
vilvh.h vr1, vr17, vr16
vilvl.h vr2, vr20, vr19
vilvh.h vr3, vr20, vr19
addi.d t6, a0, -3 // t6 = a0 - 3
vstelm.w vr0, t6, 0, 0
vstelm.h vr18, t6, 4, 0
add.d t6, t6, a1
vstelm.w vr0, t6, 0, 1
vstelm.h vr18, t6, 4, 1
add.d t6, t6, a1
vstelm.w vr0, t6, 0, 2
vstelm.h vr18, t6, 4, 2
add.d t6, t6, a1
vstelm.w vr0, t6, 0, 3
vstelm.h vr18, t6, 4, 3
add.d t6, t6, a1
vstelm.w vr1, t6, 0, 0
vstelm.h vr18, t6, 4, 4
add.d t6, t6, a1
vstelm.w vr1, t6, 0, 1
vstelm.h vr18, t6, 4, 5
add.d t6, t6, a1
vstelm.w vr1, t6, 0, 2
vstelm.h vr18, t6, 4, 6
add.d t6, t6, a1
vstelm.w vr1, t6, 0, 3
vstelm.h vr18, t6, 4, 7
add.d t6, t6, a1
vstelm.w vr2, t6, 0, 0
vstelm.h vr21, t6, 4, 0
add.d t6, t6, a1
vstelm.w vr2, t6, 0, 1
vstelm.h vr21, t6, 4, 1
add.d t6, t6, a1
vstelm.w vr2, t6, 0, 2
vstelm.h vr21, t6, 4, 2
add.d t6, t6, a1
vstelm.w vr2, t6, 0, 3
vstelm.h vr21, t6, 4, 3
add.d t6, t6, a1
vstelm.w vr3, t6, 0, 0
vstelm.h vr21, t6, 4, 4
add.d t6, t6, a1
vstelm.w vr3, t6, 0, 1
vstelm.h vr21, t6, 4, 5
add.d t6, t6, a1
vstelm.w vr3, t6, 0, 2
vstelm.h vr21, t6, 4, 6
add.d t6, t6, a1
vstelm.w vr3, t6, 0, 3
vstelm.h vr21, t6, 4, 7
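// Restore register values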
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc_x264
#endif /* !HIGH_BIT_DEPTH */