loongarch: Enhance ultrafast encoding performance

Using the following command, ultrafast encoding
has improved from 182fps to 189fps:
./x264 --preset ultrafast -o out.mkv yuv_1920x1080.yuv
This commit is contained in:
Xiwei Gu 2024-03-05 14:35:43 +08:00 committed by gxw
parent 162622863a
commit 7ed753b10a
3 changed files with 267 additions and 2 deletions

View File

@ -984,3 +984,248 @@ function_x264 decimate_score64_lsx
jirl $r0, $r1, 0x0
.END_SCORE_64_LSX:
endfunc_x264
/*
 * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
 *
 * Walks 16 halfword coefficients from the highest index down and records
 * every non-zero one (LASX version).
 *
 * In:   a0 = dct, a1 = runlevel
 * Out:  a0            = number of non-zero coefficients
 *       word  a1 + 0  = index of the highest non-zero coefficient (last)
 *       word  a1 + 4  = mask, bit i set for every non-zero dct[i]
 *       halfwords starting at (a1 + 8) rounded up to 16 bytes (level)
 *                     = the non-zero coefficients, highest index first
 * Writes: t0-t8, xr0-xr3
 *
 * NOTE(review): the loop body runs once before the index is tested, so this
 * presumably relies on the block holding at least one non-zero coefficient -
 * confirm against the callers.
 * NOTE(review): there is no explicit return instruction in this body;
 * endfunc_x264 presumably emits it - confirm in the macro definition.
 */
function_x264 coeff_level_run16_lasx
    addi.w           t0,     zero,   15

    xvld             xr0,    a0,     0      // dct[0..15]
    xvldi            xr2,    1              // constant 1 in every byte

    // Build a 64-bit map in t8 holding one 4-bit field per coefficient;
    // a field is non-zero exactly when its coefficient is non-zero.
    xvssrlni.bu.h    xr0,    xr0,    0      // halfword -> byte, unsigned saturation
    xvpermi.d        xr1,    xr0,    0xd8   // bring both groups of 8 bytes into the low 128 bits
    xvsle.bu         xr3,    xr2,    xr1    // all-ones byte where 1 <= byte
    xvsrlni.b.h      xr3,    xr3,    4      // 8-bit flags -> 4-bit flags
    xvpickve2gr.du   t8,     xr3,    0
    clz.d            t1,     t8

    srai.w           t1,     t1,     2      // leading zero bits / 4 = leading zero coefficients
    sub.w            t0,     t0,     t1     // Index of the first non-zero element starting from the highest bit
    st.w             t0,     a1,     0x00   // Store runlevel->last
    addi.d           t3,     a1,     23
    addi.d           t2,     zero,   -16    // ~15
    and              t3,     t3,     t2     // runlevel->level
    xor              t4,     t4,     t4     // mask
    xor              t5,     t5,     t5     // total: number of non-zero elements
    addi.w           t6,     zero,   1      // const 1
.LOOP_COEFF_LEVEL_RUN16_LASX:
    slli.w           t7,     t0,     1      // byte offset of dct[t0]
    ldx.h            t2,     a0,     t7
    st.h             t2,     t3,     0      // append dct[t0] to the level array
    addi.d           t3,     t3,     2

    addi.w           t5,     t5,     1
    sll.w            t2,     t6,     t0
    or               t4,     t4,     t2     // mask |= 1 << t0
    // Stop once index 0 (or a negative index) has been recorded. The test
    // must look at the index: the mask in t4 is positive for every index in
    // 0..15, so a test on it cannot end the scan early.
    bge              zero,   t0,     .END_COEFF_LEVEL_RUN16_LASX
    addi.w           t0,     t0,     -1
    slli.w           t1,     t1,     2
    addi.w           t1,     t1,     4      // bits to drop: skipped zero fields + the field just handled
    sll.d            t8,     t8,     t1
    clz.d            t1,     t8
    srai.w           t1,     t1,     2
    sub.w            t0,     t0,     t1     // Index of the next non-zero element below the previous one
    bge              t0,     zero,   .LOOP_COEFF_LEVEL_RUN16_LASX
.END_COEFF_LEVEL_RUN16_LASX:
    st.w             t4,     a1,     4      // Store the mask
    move             a0,     t5             // return total
endfunc_x264
/*
 * coeff_level_run15 (LASX version); a0 = coefficient pointer, a1 = run/level
 * output, same register roles as the 16-coefficient routine.
 *
 * Walks 15 halfword coefficients from the highest index down and records
 * every non-zero one.
 *
 * Out:  a0            = number of non-zero coefficients
 *       word  a1 + 0  = index of the highest non-zero coefficient
 *       word  a1 + 4  = mask, bit i set for every non-zero coefficient i
 *       halfwords starting at (a1 + 8) rounded up to 16 bytes
 *                     = the non-zero coefficients, highest index first
 * Writes: t0-t8, xr0-xr4
 *
 * The two loads fetch 32 bytes, i.e. one halfword more than the 15
 * coefficients; that 16th halfword is replaced by zero before it is used.
 *
 * NOTE(review): the loop body runs once before the index is tested, so this
 * presumably relies on the block holding at least one non-zero coefficient -
 * confirm against the callers.
 * NOTE(review): there is no explicit return instruction in this body;
 * endfunc_x264 presumably emits it - confirm in the macro definition.
 */
function_x264 coeff_level_run15_lasx
    addi.w           t0,     zero,   15

    vld              vr0,    a0,     0      // halfwords 0..7
    vld              vr1,    a0,     16     // halfwords 8..15
    xvldi            xr3,    1              // constant 1 in every byte
    vinsgr2vr.h      vr1,    zero,   7      // halfword 15 is not part of the block: force it to 0

    // Build a 64-bit map in t8 holding one 4-bit field per coefficient;
    // a field is non-zero exactly when its coefficient is non-zero.
    xvpermi.q        xr1,    xr0,    0x20   // xr1 = vr0 (low 128 bits) : vr1 (high 128 bits)
    xvssrlni.bu.h    xr1,    xr1,    0      // halfword -> byte, unsigned saturation
    xvpermi.d        xr2,    xr1,    0xd8   // bring both groups of 8 bytes into the low 128 bits
    xvsle.bu         xr4,    xr3,    xr2    // all-ones byte where 1 <= byte
    xvsrlni.b.h      xr4,    xr4,    4      // 8-bit flags -> 4-bit flags
    xvpickve2gr.du   t8,     xr4,    0
    clz.d            t1,     t8

    srai.w           t1,     t1,     2      // leading zero bits / 4 = leading zero coefficients
    sub.w            t0,     t0,     t1     // Index of the first non-zero element starting from the highest bit
    st.w             t0,     a1,     0x00   // Store runlevel->last
    addi.d           t3,     a1,     23
    addi.d           t2,     zero,   -16    // ~15
    and              t3,     t3,     t2     // runlevel->level
    xor              t4,     t4,     t4     // mask
    xor              t5,     t5,     t5     // total: number of non-zero elements
    addi.w           t6,     zero,   1      // const 1
.LOOP_COEFF_LEVEL_RUN15_LASX:
    slli.w           t7,     t0,     1      // byte offset of coefficient t0
    ldx.h            t2,     a0,     t7
    st.h             t2,     t3,     0      // append it to the level array
    addi.d           t3,     t3,     2

    addi.w           t5,     t5,     1
    sll.w            t2,     t6,     t0
    or               t4,     t4,     t2     // mask |= 1 << t0
    // Stop once index 0 (or a negative index) has been recorded. The test
    // must look at the index: the mask in t4 is positive for every index in
    // 0..14, so a test on it cannot end the scan early.
    bge              zero,   t0,     .END_COEFF_LEVEL_RUN15_LASX
    addi.w           t0,     t0,     -1
    slli.w           t1,     t1,     2
    addi.w           t1,     t1,     4      // bits to drop: skipped zero fields + the field just handled
    sll.d            t8,     t8,     t1
    clz.d            t1,     t8
    srai.w           t1,     t1,     2
    sub.w            t0,     t0,     t1     // Index of the next non-zero element below the previous one
    bge              t0,     zero,   .LOOP_COEFF_LEVEL_RUN15_LASX
.END_COEFF_LEVEL_RUN15_LASX:
    st.w             t4,     a1,     4      // Store the mask
    move             a0,     t5             // return total
endfunc_x264
/*
 * coeff_level_run16 (LSX version); a0 = coefficient pointer, a1 = run/level
 * output.
 *
 * Walks 16 halfword coefficients from the highest index down and records
 * every non-zero one.
 *
 * Out:  a0            = number of non-zero coefficients
 *       word  a1 + 0  = index of the highest non-zero coefficient
 *       word  a1 + 4  = mask, bit i set for every non-zero coefficient i
 *       halfwords starting at (a1 + 8) rounded up to 16 bytes
 *                     = the non-zero coefficients, highest index first
 * Writes: t0-t8, vr0-vr3
 *
 * NOTE(review): the loop body runs once before the index is tested, so this
 * presumably relies on the block holding at least one non-zero coefficient -
 * confirm against the callers.
 * NOTE(review): there is no explicit return instruction in this body;
 * endfunc_x264 presumably emits it - confirm in the macro definition.
 */
function_x264 coeff_level_run16_lsx
    addi.w           t0,     zero,   15

    vld              vr0,    a0,     0      // halfwords 0..7
    vld              vr1,    a0,     16     // halfwords 8..15
    vldi             vr2,    1              // constant 1 in every byte

    // Build a 64-bit map in t8 holding one 4-bit field per coefficient;
    // a field is non-zero exactly when its coefficient is non-zero.
    vssrlni.bu.h     vr0,    vr0,    0      // halfword -> byte, unsigned saturation
    vssrlni.bu.h     vr1,    vr1,    0
    vpermi.w         vr1,    vr0,    0x44   // low 64 bits from vr0, high 64 bits from vr1
    vsle.bu          vr3,    vr2,    vr1    // all-ones byte where 1 <= byte
    vsrlni.b.h       vr3,    vr3,    4      // 8-bit flags -> 4-bit flags
    vpickve2gr.du    t8,     vr3,    0
    clz.d            t1,     t8

    srai.w           t1,     t1,     2      // leading zero bits / 4 = leading zero coefficients
    sub.w            t0,     t0,     t1     // Index of the first non-zero element starting from the highest bit
    st.w             t0,     a1,     0x00   // Store runlevel->last
    addi.d           t3,     a1,     23
    addi.d           t2,     zero,   -16    // ~15
    and              t3,     t3,     t2     // runlevel->level
    xor              t4,     t4,     t4     // mask
    xor              t5,     t5,     t5     // total: number of non-zero elements
    addi.w           t6,     zero,   1      // const 1
.LOOP_COEFF_LEVEL_RUN16_LSX:
    slli.w           t7,     t0,     1      // byte offset of coefficient t0
    ldx.h            t2,     a0,     t7
    st.h             t2,     t3,     0      // append it to the level array
    addi.d           t3,     t3,     2

    addi.w           t5,     t5,     1
    sll.w            t2,     t6,     t0
    or               t4,     t4,     t2     // mask |= 1 << t0
    // Stop once index 0 (or a negative index) has been recorded. The test
    // must look at the index: the mask in t4 is positive for every index in
    // 0..15, so a test on it cannot end the scan early.
    bge              zero,   t0,     .END_COEFF_LEVEL_RUN16_LSX
    addi.w           t0,     t0,     -1
    slli.w           t1,     t1,     2
    addi.w           t1,     t1,     4      // bits to drop: skipped zero fields + the field just handled
    sll.d            t8,     t8,     t1
    clz.d            t1,     t8
    srai.w           t1,     t1,     2
    sub.w            t0,     t0,     t1     // Index of the next non-zero element below the previous one
    bge              t0,     zero,   .LOOP_COEFF_LEVEL_RUN16_LSX
.END_COEFF_LEVEL_RUN16_LSX:
    st.w             t4,     a1,     4      // Store the mask
    move             a0,     t5             // return total
endfunc_x264
/*
 * coeff_level_run15 (LSX version); a0 = coefficient pointer, a1 = run/level
 * output.
 *
 * Walks 15 halfword coefficients from the highest index down and records
 * every non-zero one.
 *
 * Out:  a0            = number of non-zero coefficients
 *       word  a1 + 0  = index of the highest non-zero coefficient
 *       word  a1 + 4  = mask, bit i set for every non-zero coefficient i
 *       halfwords starting at (a1 + 8) rounded up to 16 bytes
 *                     = the non-zero coefficients, highest index first
 * Writes: t0-t8, vr0-vr3
 *
 * The two loads fetch 32 bytes, i.e. one halfword more than the 15
 * coefficients; that 16th halfword is replaced by zero before it is used.
 *
 * NOTE(review): the loop body runs once before the index is tested, so this
 * presumably relies on the block holding at least one non-zero coefficient -
 * confirm against the callers.
 * NOTE(review): there is no explicit return instruction in this body;
 * endfunc_x264 presumably emits it - confirm in the macro definition.
 */
function_x264 coeff_level_run15_lsx
    addi.w           t0,     zero,   15

    vld              vr0,    a0,     0      // halfwords 0..7
    vld              vr1,    a0,     16     // halfwords 8..15
    vldi             vr2,    1              // constant 1 in every byte
    vinsgr2vr.h      vr1,    zero,   7      // halfword 15 is not part of the block: force it to 0

    // Build a 64-bit map in t8 holding one 4-bit field per coefficient;
    // a field is non-zero exactly when its coefficient is non-zero.
    vssrlni.bu.h     vr0,    vr0,    0      // halfword -> byte, unsigned saturation
    vssrlni.bu.h     vr1,    vr1,    0
    vpermi.w         vr1,    vr0,    0x44   // low 64 bits from vr0, high 64 bits from vr1
    vsle.bu          vr3,    vr2,    vr1    // all-ones byte where 1 <= byte
    vsrlni.b.h       vr3,    vr3,    4      // 8-bit flags -> 4-bit flags
    vpickve2gr.du    t8,     vr3,    0
    clz.d            t1,     t8

    srai.w           t1,     t1,     2      // leading zero bits / 4 = leading zero coefficients
    sub.w            t0,     t0,     t1     // Index of the first non-zero element starting from the highest bit
    st.w             t0,     a1,     0x00   // Store runlevel->last
    addi.d           t3,     a1,     23
    addi.d           t2,     zero,   -16    // ~15
    and              t3,     t3,     t2     // runlevel->level
    xor              t4,     t4,     t4     // mask
    xor              t5,     t5,     t5     // total: number of non-zero elements
    addi.w           t6,     zero,   1      // const 1
.LOOP_COEFF_LEVEL_RUN15_LSX:
    slli.w           t7,     t0,     1      // byte offset of coefficient t0
    ldx.h            t2,     a0,     t7
    st.h             t2,     t3,     0      // append it to the level array
    addi.d           t3,     t3,     2

    addi.w           t5,     t5,     1
    sll.w            t2,     t6,     t0
    or               t4,     t4,     t2     // mask |= 1 << t0
    // Stop once index 0 (or a negative index) has been recorded. The test
    // must look at the index: the mask in t4 is positive for every index in
    // 0..14, so a test on it cannot end the scan early.
    bge              zero,   t0,     .END_COEFF_LEVEL_RUN15_LSX
    addi.w           t0,     t0,     -1
    slli.w           t1,     t1,     2
    addi.w           t1,     t1,     4      // bits to drop: skipped zero fields + the field just handled
    sll.d            t8,     t8,     t1
    clz.d            t1,     t8
    srai.w           t1,     t1,     2
    sub.w            t0,     t0,     t1     // Index of the next non-zero element below the previous one
    bge              t0,     zero,   .LOOP_COEFF_LEVEL_RUN15_LSX
.END_COEFF_LEVEL_RUN15_LSX:
    st.w             t4,     a1,     4      // Store the mask
    move             a0,     t5             // return total
endfunc_x264
/*
 * coeff_level_run8 (LSX version); a0 = coefficient pointer, a1 = run/level
 * output.
 *
 * Walks 8 halfword coefficients from the highest index down and records
 * every non-zero one.
 *
 * Out:  a0            = number of non-zero coefficients
 *       word  a1 + 0  = index of the highest non-zero coefficient
 *       word  a1 + 4  = mask, bit i set for every non-zero coefficient i
 *       halfwords starting at (a1 + 8) rounded up to 16 bytes
 *                     = the non-zero coefficients, highest index first
 * Writes: t0-t8, vr0-vr3
 *
 * NOTE(review): the loop body runs once before the index is tested, so this
 * presumably relies on the block holding at least one non-zero coefficient -
 * confirm against the callers.
 * NOTE(review): there is no explicit return instruction in this body;
 * endfunc_x264 presumably emits it - confirm in the macro definition.
 */
function_x264 coeff_level_run8_lsx
    // The map built below is still 16 fields wide; its upper 8 fields come
    // from the zeroed vr1, so counting down from 15 lands on an index <= 7.
    addi.w           t0,     zero,   15

    vld              vr0,    a0,     0      // halfwords 0..7
    vxor.v           vr1,    vr1,    vr1    // zero filler for the upper half of the map
    vldi             vr2,    1              // constant 1 in every byte

    // Build a 64-bit map in t8 holding one 4-bit field per coefficient;
    // a field is non-zero exactly when its coefficient is non-zero.
    vssrlni.bu.h     vr0,    vr0,    0      // halfword -> byte, unsigned saturation
    vpermi.w         vr1,    vr0,    0x44   // low 64 bits from vr0, high 64 bits stay zero
    vsle.bu          vr3,    vr2,    vr1    // all-ones byte where 1 <= byte
    vsrlni.b.h       vr3,    vr3,    4      // 8-bit flags -> 4-bit flags
    vpickve2gr.du    t8,     vr3,    0
    clz.d            t1,     t8

    srai.w           t1,     t1,     2      // leading zero bits / 4 = leading zero fields
    sub.w            t0,     t0,     t1     // Index of the first non-zero element starting from the highest bit
    st.w             t0,     a1,     0x00   // Store runlevel->last
    addi.d           t3,     a1,     23
    addi.d           t2,     zero,   -16    // ~15
    and              t3,     t3,     t2     // runlevel->level
    xor              t4,     t4,     t4     // mask
    xor              t5,     t5,     t5     // total: number of non-zero elements
    addi.w           t6,     zero,   1      // const 1
.LOOP_COEFF_LEVEL_RUN8_LSX:
    slli.w           t7,     t0,     1      // byte offset of coefficient t0
    ldx.h            t2,     a0,     t7
    st.h             t2,     t3,     0      // append it to the level array
    addi.d           t3,     t3,     2

    addi.w           t5,     t5,     1
    sll.w            t2,     t6,     t0
    or               t4,     t4,     t2     // mask |= 1 << t0
    // Stop once index 0 (or a negative index) has been recorded. The test
    // must look at the index: the mask in t4 is positive for every index in
    // 0..7, so a test on it cannot end the scan early.
    bge              zero,   t0,     .END_COEFF_LEVEL_RUN8_LSX
    addi.w           t0,     t0,     -1
    slli.w           t1,     t1,     2
    addi.w           t1,     t1,     4      // bits to drop: skipped zero fields + the field just handled
    sll.d            t8,     t8,     t1
    clz.d            t1,     t8
    srai.w           t1,     t1,     2
    sub.w            t0,     t0,     t1     // Index of the next non-zero element below the previous one
    bge              t0,     zero,   .LOOP_COEFF_LEVEL_RUN8_LSX
.END_COEFF_LEVEL_RUN8_LSX:
    st.w             t4,     a1,     4      // Store the mask
    move             a0,     t5             // return total
endfunc_x264

View File

@ -81,4 +81,16 @@ void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
/* Run/level extraction routines implemented in LoongArch assembly.
 * Each takes the coefficient array and the x264_run_level_t to fill, and
 * returns the number of non-zero coefficients found. The digits in the name
 * give the number of coefficients scanned (16, 15 or 8); the suffix names the
 * vector extension used (LASX or LSX). */

/* 16 coefficients, LASX. */
#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
/* 15 coefficients, LASX. */
#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
/* 16 coefficients, LSX. */
#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
/* 15 coefficients, LSX. */
#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
/* 8 coefficients, LSX. */
#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
#endif/* X264_LOONGARCH_QUANT_H */

View File

@ -848,11 +848,17 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_lsx;
pf->dequant_8x8 = x264_dequant_8x8_lsx;
pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx;
pf->coeff_last4 = x264_coeff_last4_lsx;
pf->coeff_last8 = x264_coeff_last8_lsx;
pf->decimate_score15 = x264_decimate_score15_lsx;
pf->decimate_score16 = x264_decimate_score16_lsx;
pf->decimate_score64 = x264_decimate_score64_lsx;
pf->coeff_last4 = x264_coeff_last4_lsx;
pf->coeff_last8 = x264_coeff_last8_lsx;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx;
pf->coeff_level_run8 = x264_coeff_level_run8_lsx;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx;
}
if( cpu&X264_CPU_LASX )
{
@ -863,6 +869,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx;
}
#endif