mirror of https://code.videolan.org/videolan/x264
loongarch: Enhance ultrafast encoding performance
Measured with the following command, ultrafast encoding speed improves from 182 fps to 189 fps: ./x264 --preset ultrafast -o out.mkv yuv_1920x1080.yuv
This commit is contained in:
parent
162622863a
commit
7ed753b10a
|
@ -984,3 +984,248 @@ function_x264 decimate_score64_lsx
|
|||
jirl $r0, $r1, 0x0
|
||||
.END_SCORE_64_LSX:
|
||||
endfunc_x264
|
||||
|
||||
/*
 * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
 *
 * In:   a0 = dct       (16 coefficients, read as 16-bit halfwords)
 *       a1 = runlevel
 * Out:  a0 = number of coefficients stored
 *       32-bit word at runlevel+0 : index of the last non-zero coefficient
 *       32-bit word at runlevel+4 : mask, bit i set for every stored dct[i]
 *       halfwords at (runlevel + 23) & ~15 : the stored coefficients,
 *                                            highest index first
 * Uses: t0-t8, xr0-xr3
 *
 * If no coefficient is non-zero, -1 is stored as the last index and one
 * loop iteration still runs (it reads the halfword just before dct and
 * sets mask bit 31); presumably callers never pass an all-zero block.
 */
function_x264 coeff_level_run16_lasx
    // Build a 64-bit map in t8 with one nibble per coefficient:
    // nibble i (bits 4i+3..4i) is 0xf when dct[i] != 0, else 0.
    xvld            xr0, a0, 0          // dct[0..15]
    xvldi           xr2, 1              // 1 in every byte
    xvssrlni.bu.h   xr0, xr0, 0         // saturate halfwords to unsigned bytes; non-zero stays non-zero
    xvpermi.d       xr1, xr0, 0xd8      // 64-bit elements 0,2,1,3: low 128 bits = bytes for dct[0..15]
    xvsle.bu        xr3, xr2, xr1       // 0xff where byte >= 1
    xvsrlni.b.h     xr3, xr3, 4         // squeeze every 0xff/0x00 byte into one nibble
    xvpickve2gr.du  t8, xr3, 0

    // last = 15 - (number of leading zero nibbles in t8)
    clz.d           t1, t8
    srai.w          t1, t1, 2
    addi.w          t0, zero, 15
    sub.w           t0, t0, t1
    st.w            t0, a1, 0x00        // Store runlevel->last

    addi.d          t3, a1, 23
    bstrins.d       t3, zero, 3, 0      // t3 = (runlevel + 23) & ~15: runlevel->level
    move            t4, zero            // mask
    move            t5, zero            // total: number of non-zero elements
    addi.w          t6, zero, 1         // const 1
.LOOP_COEFF_LEVEL_RUN16_LASX:
    slli.w          t7, t0, 1           // byte offset of dct[t0]
    ldx.h           t2, a0, t7
    st.h            t2, t3, 0           // *level++ = dct[t0]
    addi.d          t3, t3, 2

    addi.w          t5, t5, 1           // total++
    sll.w           t2, t6, t0
    or              t4, t4, t2          // mask |= 1 << t0
    // mask <= 0 only happens when t0 == -1 (shift amount 31 sets the
    // sign bit), i.e. when the block had no non-zero coefficient.
    bge             zero, t4, .END_COEFF_LEVEL_RUN16_LASX

    addi.w          t0, t0, -1
    slli.w          t1, t1, 2
    addi.w          t1, t1, 4
    sll.d           t8, t8, t1          // drop the nibble just consumed and the zero nibbles above it
    clz.d           t1, t8
    srai.w          t1, t1, 2           // zero coefficients to skip
    sub.w           t0, t0, t1          // index of the next non-zero coefficient, negative when none is left
    bge             t0, zero, .LOOP_COEFF_LEVEL_RUN16_LASX
.END_COEFF_LEVEL_RUN16_LASX:
    st.w            t4, a1, 4           // Store runlevel->mask
    move            a0, t5              // return total
    // NOTE(review): no explicit return here; presumably endfunc_x264 emits it - confirm against the macro.
endfunc_x264
|
||||
|
||||
/*
 * int coeff_level_run15( dctcoef *dct, x264_run_level_t *runlevel )
 *
 * In:   a0 = dct       (15 coefficients, read as 16-bit halfwords)
 *       a1 = runlevel
 * Out:  a0 = number of coefficients stored
 *       32-bit word at runlevel+0 : index of the last non-zero coefficient
 *       32-bit word at runlevel+4 : mask, bit i set for every stored dct[i]
 *       halfwords at (runlevel + 23) & ~15 : the stored coefficients,
 *                                            highest index first
 * Uses: t0-t8, xr0-xr4
 *
 * 32 bytes are loaded, so the halfword following the 15th coefficient is
 * read (and then discarded); it must be readable.
 * If no coefficient is non-zero, -1 is stored as the last index and one
 * loop iteration still runs (it reads the halfword just before dct and
 * sets mask bit 31); presumably callers never pass an all-zero block.
 */
function_x264 coeff_level_run15_lasx
    // Build a 64-bit map in t8 with one nibble per coefficient:
    // nibble i (bits 4i+3..4i) is 0xf when dct[i] != 0, else 0.
    vld             vr0, a0, 0          // dct[0..7]
    vld             vr1, a0, 16         // dct[8..14] plus one extra halfword
    xvldi           xr3, 1              // 1 in every byte
    vinsgr2vr.h     vr1, zero, 7        // clear the extra halfword (index 15)
    xvpermi.q       xr1, xr0, 0x20      // xr1 = { low: dct[0..7], high: dct[8..15] }
    xvssrlni.bu.h   xr1, xr1, 0         // saturate halfwords to unsigned bytes; non-zero stays non-zero
    xvpermi.d       xr2, xr1, 0xd8      // 64-bit elements 0,2,1,3: low 128 bits = bytes for index 0..15
    xvsle.bu        xr4, xr3, xr2       // 0xff where byte >= 1
    xvsrlni.b.h     xr4, xr4, 4         // squeeze every 0xff/0x00 byte into one nibble
    xvpickve2gr.du  t8, xr4, 0

    // last = 15 - (number of leading zero nibbles in t8);
    // nibble 15 is always zero here, so last <= 14.
    clz.d           t1, t8
    srai.w          t1, t1, 2
    addi.w          t0, zero, 15
    sub.w           t0, t0, t1
    st.w            t0, a1, 0x00        // Store runlevel->last

    addi.d          t3, a1, 23
    bstrins.d       t3, zero, 3, 0      // t3 = (runlevel + 23) & ~15: runlevel->level
    move            t4, zero            // mask
    move            t5, zero            // total: number of non-zero elements
    addi.w          t6, zero, 1         // const 1
.LOOP_COEFF_LEVEL_RUN15_LASX:
    slli.w          t7, t0, 1           // byte offset of dct[t0]
    ldx.h           t2, a0, t7
    st.h            t2, t3, 0           // *level++ = dct[t0]
    addi.d          t3, t3, 2

    addi.w          t5, t5, 1           // total++
    sll.w           t2, t6, t0
    or              t4, t4, t2          // mask |= 1 << t0
    // mask <= 0 only happens when t0 == -1 (shift amount 31 sets the
    // sign bit), i.e. when the block had no non-zero coefficient.
    bge             zero, t4, .END_COEFF_LEVEL_RUN15_LASX

    addi.w          t0, t0, -1
    slli.w          t1, t1, 2
    addi.w          t1, t1, 4
    sll.d           t8, t8, t1          // drop the nibble just consumed and the zero nibbles above it
    clz.d           t1, t8
    srai.w          t1, t1, 2           // zero coefficients to skip
    sub.w           t0, t0, t1          // index of the next non-zero coefficient, negative when none is left
    bge             t0, zero, .LOOP_COEFF_LEVEL_RUN15_LASX
.END_COEFF_LEVEL_RUN15_LASX:
    st.w            t4, a1, 4           // Store runlevel->mask
    move            a0, t5              // return total
    // NOTE(review): no explicit return here; presumably endfunc_x264 emits it - confirm against the macro.
endfunc_x264
|
||||
|
||||
/*
 * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel )
 *
 * In:   a0 = dct       (16 coefficients, read as 16-bit halfwords)
 *       a1 = runlevel
 * Out:  a0 = number of coefficients stored
 *       32-bit word at runlevel+0 : index of the last non-zero coefficient
 *       32-bit word at runlevel+4 : mask, bit i set for every stored dct[i]
 *       halfwords at (runlevel + 23) & ~15 : the stored coefficients,
 *                                            highest index first
 * Uses: t0-t8, vr0-vr3
 *
 * If no coefficient is non-zero, -1 is stored as the last index and one
 * loop iteration still runs (it reads the halfword just before dct and
 * sets mask bit 31); presumably callers never pass an all-zero block.
 */
function_x264 coeff_level_run16_lsx
    // Build a 64-bit map in t8 with one nibble per coefficient:
    // nibble i (bits 4i+3..4i) is 0xf when dct[i] != 0, else 0.
    vld             vr0, a0, 0          // dct[0..7]
    vld             vr1, a0, 16         // dct[8..15]
    vldi            vr2, 1              // 1 in every byte
    vssrlni.bu.h    vr0, vr0, 0         // saturate halfwords to unsigned bytes; non-zero stays non-zero
    vssrlni.bu.h    vr1, vr1, 0
    vpermi.w        vr1, vr0, 0x44      // vr1 = { low 64 of vr0, low 64 of vr1 }: bytes for dct[0..15]
    vsle.bu         vr3, vr2, vr1       // 0xff where byte >= 1
    vsrlni.b.h      vr3, vr3, 4         // squeeze every 0xff/0x00 byte into one nibble
    vpickve2gr.du   t8, vr3, 0

    // last = 15 - (number of leading zero nibbles in t8)
    clz.d           t1, t8
    srai.w          t1, t1, 2
    addi.w          t0, zero, 15
    sub.w           t0, t0, t1
    st.w            t0, a1, 0x00        // Store runlevel->last

    addi.d          t3, a1, 23
    bstrins.d       t3, zero, 3, 0      // t3 = (runlevel + 23) & ~15: runlevel->level
    move            t4, zero            // mask
    move            t5, zero            // total: number of non-zero elements
    addi.w          t6, zero, 1         // const 1
.LOOP_COEFF_LEVEL_RUN16_LSX:
    slli.w          t7, t0, 1           // byte offset of dct[t0]
    ldx.h           t2, a0, t7
    st.h            t2, t3, 0           // *level++ = dct[t0]
    addi.d          t3, t3, 2

    addi.w          t5, t5, 1           // total++
    sll.w           t2, t6, t0
    or              t4, t4, t2          // mask |= 1 << t0
    // mask <= 0 only happens when t0 == -1 (shift amount 31 sets the
    // sign bit), i.e. when the block had no non-zero coefficient.
    bge             zero, t4, .END_COEFF_LEVEL_RUN16_LSX

    addi.w          t0, t0, -1
    slli.w          t1, t1, 2
    addi.w          t1, t1, 4
    sll.d           t8, t8, t1          // drop the nibble just consumed and the zero nibbles above it
    clz.d           t1, t8
    srai.w          t1, t1, 2           // zero coefficients to skip
    sub.w           t0, t0, t1          // index of the next non-zero coefficient, negative when none is left
    bge             t0, zero, .LOOP_COEFF_LEVEL_RUN16_LSX
.END_COEFF_LEVEL_RUN16_LSX:
    st.w            t4, a1, 4           // Store runlevel->mask
    move            a0, t5              // return total
    // NOTE(review): no explicit return here; presumably endfunc_x264 emits it - confirm against the macro.
endfunc_x264
|
||||
|
||||
/*
 * int coeff_level_run15( dctcoef *dct, x264_run_level_t *runlevel )
 *
 * In:   a0 = dct       (15 coefficients, read as 16-bit halfwords)
 *       a1 = runlevel
 * Out:  a0 = number of coefficients stored
 *       32-bit word at runlevel+0 : index of the last non-zero coefficient
 *       32-bit word at runlevel+4 : mask, bit i set for every stored dct[i]
 *       halfwords at (runlevel + 23) & ~15 : the stored coefficients,
 *                                            highest index first
 * Uses: t0-t8, vr0-vr3
 *
 * 32 bytes are loaded, so the halfword following the 15th coefficient is
 * read (and then discarded); it must be readable.
 * If no coefficient is non-zero, -1 is stored as the last index and one
 * loop iteration still runs (it reads the halfword just before dct and
 * sets mask bit 31); presumably callers never pass an all-zero block.
 */
function_x264 coeff_level_run15_lsx
    // Build a 64-bit map in t8 with one nibble per coefficient:
    // nibble i (bits 4i+3..4i) is 0xf when dct[i] != 0, else 0.
    vld             vr0, a0, 0          // dct[0..7]
    vld             vr1, a0, 16         // dct[8..14] plus one extra halfword
    vldi            vr2, 1              // 1 in every byte
    vinsgr2vr.h     vr1, zero, 7        // clear the extra halfword (index 15)
    vssrlni.bu.h    vr0, vr0, 0         // saturate halfwords to unsigned bytes; non-zero stays non-zero
    vssrlni.bu.h    vr1, vr1, 0
    vpermi.w        vr1, vr0, 0x44      // vr1 = { low 64 of vr0, low 64 of vr1 }: bytes for index 0..15
    vsle.bu         vr3, vr2, vr1       // 0xff where byte >= 1
    vsrlni.b.h      vr3, vr3, 4         // squeeze every 0xff/0x00 byte into one nibble
    vpickve2gr.du   t8, vr3, 0

    // last = 15 - (number of leading zero nibbles in t8);
    // nibble 15 is always zero here, so last <= 14.
    clz.d           t1, t8
    srai.w          t1, t1, 2
    addi.w          t0, zero, 15
    sub.w           t0, t0, t1
    st.w            t0, a1, 0x00        // Store runlevel->last

    addi.d          t3, a1, 23
    bstrins.d       t3, zero, 3, 0      // t3 = (runlevel + 23) & ~15: runlevel->level
    move            t4, zero            // mask
    move            t5, zero            // total: number of non-zero elements
    addi.w          t6, zero, 1         // const 1
.LOOP_COEFF_LEVEL_RUN15_LSX:
    slli.w          t7, t0, 1           // byte offset of dct[t0]
    ldx.h           t2, a0, t7
    st.h            t2, t3, 0           // *level++ = dct[t0]
    addi.d          t3, t3, 2

    addi.w          t5, t5, 1           // total++
    sll.w           t2, t6, t0
    or              t4, t4, t2          // mask |= 1 << t0
    // mask <= 0 only happens when t0 == -1 (shift amount 31 sets the
    // sign bit), i.e. when the block had no non-zero coefficient.
    bge             zero, t4, .END_COEFF_LEVEL_RUN15_LSX

    addi.w          t0, t0, -1
    slli.w          t1, t1, 2
    addi.w          t1, t1, 4
    sll.d           t8, t8, t1          // drop the nibble just consumed and the zero nibbles above it
    clz.d           t1, t8
    srai.w          t1, t1, 2           // zero coefficients to skip
    sub.w           t0, t0, t1          // index of the next non-zero coefficient, negative when none is left
    bge             t0, zero, .LOOP_COEFF_LEVEL_RUN15_LSX
.END_COEFF_LEVEL_RUN15_LSX:
    st.w            t4, a1, 4           // Store runlevel->mask
    move            a0, t5              // return total
    // NOTE(review): no explicit return here; presumably endfunc_x264 emits it - confirm against the macro.
endfunc_x264
|
||||
|
||||
/*
 * int coeff_level_run8( dctcoef *dct, x264_run_level_t *runlevel )
 *
 * In:   a0 = dct       (8 coefficients, read as 16-bit halfwords)
 *       a1 = runlevel
 * Out:  a0 = number of coefficients stored
 *       32-bit word at runlevel+0 : index of the last non-zero coefficient
 *       32-bit word at runlevel+4 : mask, bit i set for every stored dct[i]
 *       halfwords at (runlevel + 23) & ~15 : the stored coefficients,
 *                                            highest index first
 * Uses: t0-t8, vr0-vr3
 *
 * If no coefficient is non-zero, -1 is stored as the last index and one
 * loop iteration still runs (it reads the halfword just before dct and
 * sets mask bit 31); presumably callers never pass an all-zero block.
 */
function_x264 coeff_level_run8_lsx
    // Build a 64-bit map in t8 with one nibble per coefficient:
    // nibble i (bits 4i+3..4i) is 0xf when dct[i] != 0, else 0.
    // Only nibbles 0..7 can be set; nibbles 8..15 come from the zeroed vr1.
    vld             vr0, a0, 0          // dct[0..7]
    vxor.v          vr1, vr1, vr1       // zero filler for the upper half of the map
    vldi            vr2, 1              // 1 in every byte
    vssrlni.bu.h    vr0, vr0, 0         // saturate halfwords to unsigned bytes; non-zero stays non-zero
    vpermi.w        vr1, vr0, 0x44      // vr1 = { low 64 of vr0, zeros }
    vsle.bu         vr3, vr2, vr1       // 0xff where byte >= 1
    vsrlni.b.h      vr3, vr3, 4         // squeeze every 0xff/0x00 byte into one nibble
    vpickve2gr.du   t8, vr3, 0

    // last = 15 - (number of leading zero nibbles in t8). The upper eight
    // nibbles are always zero, so the count is >= 8 and last <= 7.
    clz.d           t1, t8
    srai.w          t1, t1, 2
    addi.w          t0, zero, 15
    sub.w           t0, t0, t1
    st.w            t0, a1, 0x00        // Store runlevel->last

    addi.d          t3, a1, 23
    bstrins.d       t3, zero, 3, 0      // t3 = (runlevel + 23) & ~15: runlevel->level
    move            t4, zero            // mask
    move            t5, zero            // total: number of non-zero elements
    addi.w          t6, zero, 1         // const 1
.LOOP_COEFF_LEVEL_RUN8_LSX:
    slli.w          t7, t0, 1           // byte offset of dct[t0]
    ldx.h           t2, a0, t7
    st.h            t2, t3, 0           // *level++ = dct[t0]
    addi.d          t3, t3, 2

    addi.w          t5, t5, 1           // total++
    sll.w           t2, t6, t0
    or              t4, t4, t2          // mask |= 1 << t0
    // mask <= 0 only happens when t0 == -1 (shift amount 31 sets the
    // sign bit), i.e. when the block had no non-zero coefficient.
    bge             zero, t4, .END_COEFF_LEVEL_RUN8_LSX

    addi.w          t0, t0, -1
    slli.w          t1, t1, 2
    addi.w          t1, t1, 4
    sll.d           t8, t8, t1          // drop the nibble just consumed and the zero nibbles above it
    clz.d           t1, t8
    srai.w          t1, t1, 2           // zero coefficients to skip
    sub.w           t0, t0, t1          // index of the next non-zero coefficient, negative when none is left
    bge             t0, zero, .LOOP_COEFF_LEVEL_RUN8_LSX
.END_COEFF_LEVEL_RUN8_LSX:
    st.w            t4, a1, 4           // Store runlevel->mask
    move            a0, t5              // return total
    // NOTE(review): no explicit return here; presumably endfunc_x264 emits it - confirm against the macro.
endfunc_x264
|
||||
|
|
|
@ -81,4 +81,16 @@ void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
|||
#define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
|
||||
void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
|
||||
#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
|
||||
int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
|
||||
#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
|
||||
int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
|
||||
|
||||
#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
|
||||
int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
|
||||
#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
|
||||
int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
|
||||
#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
|
||||
int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
|
||||
|
||||
#endif/* X264_LOONGARCH_QUANT_H */
|
||||
|
|
|
@ -848,11 +848,17 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
|
|||
pf->dequant_4x4 = x264_dequant_4x4_lsx;
|
||||
pf->dequant_8x8 = x264_dequant_8x8_lsx;
|
||||
pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx;
|
||||
pf->coeff_last4 = x264_coeff_last4_lsx;
|
||||
pf->coeff_last8 = x264_coeff_last8_lsx;
|
||||
pf->decimate_score15 = x264_decimate_score15_lsx;
|
||||
pf->decimate_score16 = x264_decimate_score16_lsx;
|
||||
pf->decimate_score64 = x264_decimate_score64_lsx;
|
||||
pf->coeff_last4 = x264_coeff_last4_lsx;
|
||||
pf->coeff_last8 = x264_coeff_last8_lsx;
|
||||
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx;
|
||||
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx;
|
||||
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx;
|
||||
pf->coeff_level_run8 = x264_coeff_level_run8_lsx;
|
||||
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx;
|
||||
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx;
|
||||
}
|
||||
if( cpu&X264_CPU_LASX )
|
||||
{
|
||||
|
@ -863,6 +869,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
|
|||
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx;
|
||||
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx;
|
||||
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx;
|
||||
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx;
|
||||
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Reference in New Issue