From 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Fri, 14 Jan 2011 21:34:25 +0000 Subject: [PATCH] H.264: split luma dc idct out and implement MMX/SSE2 versions About 2.5x the speed. NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If x264-style dequant was used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dsputil.h | 4 + libavcodec/h264.c | 50 ++---------- libavcodec/h264.h | 5 +- libavcodec/h264_cabac.c | 8 +- libavcodec/h264_cavlc.c | 8 +- libavcodec/h264dsp.c | 1 + libavcodec/h264dsp.h | 2 + libavcodec/h264idct.c | 35 ++++++++ libavcodec/svq3.c | 20 ++--- libavcodec/x86/dsputil_mmx.c | 1 + libavcodec/x86/h264_idct.asm | 154 +++++++++++++++++++++++++++++++++++ libavcodec/x86/h264dsp_mmx.c | 4 + 12 files changed, 227 insertions(+), 65 deletions(-) diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 6c56a65885..0efbad918a 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); +void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); +void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); + void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); void ff_float_to_int16_c(int16_t *dst, const float *src, long len); diff --git a/libavcodec/h264.c b/libavcodec/h264.c index b11d947b77..f3470474ea 100644 --- a/libavcodec/h264.c +++ 
b/libavcodec/h264.c @@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ return 0; } -/** - * IDCT transforms the 16 dc values and dequantizes them. - * @param qp quantization parameter - */ -static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){ -#define stride 16 - int i; - int temp[16]; //FIXME check if this is a good idea - static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; - static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; - -//memset(block, 64, 2*256); -//return; - for(i=0; i<4; i++){ - const int offset= y_offset[i]; - const int z0= block[offset+stride*0] + block[offset+stride*4]; - const int z1= block[offset+stride*0] - block[offset+stride*4]; - const int z2= block[offset+stride*1] - block[offset+stride*5]; - const int z3= block[offset+stride*1] + block[offset+stride*5]; - - temp[4*i+0]= z0+z3; - temp[4*i+1]= z1+z2; - temp[4*i+2]= z1-z2; - temp[4*i+3]= z0-z3; - } - - for(i=0; i<4; i++){ - const int offset= x_offset[i]; - const int z0= temp[4*0+i] + temp[4*2+i]; - const int z1= temp[4*0+i] - temp[4*2+i]; - const int z2= temp[4*1+i] - temp[4*3+i]; - const int z3= temp[4*1+i] + temp[4*3+i]; - - block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual - block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); - block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); - block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); - } -} - #if 0 /** * DCT transforms the 16 dc values. 
@@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); if(is_h264){ if(!transform_bypass) - h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]); + h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]); + else{ + static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16, + 8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16}; + for(i = 0; i < 16; i++) + h->mb[dc_mapping[i]] = h->mb_luma_dc[i]; + } }else - ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); + ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale); } if(h->deblocking_filter) xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); diff --git a/libavcodec/h264.h b/libavcodec/h264.h index 7158d9748e..189864bb38 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -406,6 +406,7 @@ typedef struct H264Context{ GetBitContext *inter_gb_ptr; DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; + DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16]; DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb /** @@ -600,10 +601,6 @@ typedef struct H264Context{ extern const uint8_t ff_h264_chroma_qp[52]; -void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); - -void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); - /** * Decode SEI */ diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c index 485837879d..971af37114 100644 --- a/libavcodec/h264_cabac.c +++ b/libavcodec/h264_cabac.c @@ -1597,17 +1597,15 @@ decode_intra_mb: s->current_picture.mb_type[mb_xy]= mb_type; if( cbp || IS_INTRA16x16( mb_type ) ) { - const uint8_t *scan, *scan8x8, *dc_scan; + const uint8_t *scan, *scan8x8; const uint32_t *qmul; if(IS_INTERLACED(mb_type)){ 
scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0; scan= s->qscale ? h->field_scan : h->field_scan_q0; - dc_scan= luma_dc_field_scan; }else{ scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0; scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0; - dc_scan= luma_dc_zigzag_scan; } // decode_cabac_mb_dqp @@ -1642,7 +1640,9 @@ decode_intra_mb: if( IS_INTRA16x16( mb_type ) ) { int i; //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); - decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16); + AV_ZERO128(h->mb_luma_dc+0); + AV_ZERO128(h->mb_luma_dc+8); + decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16); if( cbp&15 ) { qmul = h->dequant4_coeff[0][s->qscale]; diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c index 6f3bcad782..426a285570 100644 --- a/libavcodec/h264_cavlc.c +++ b/libavcodec/h264_cavlc.c @@ -911,16 +911,14 @@ decode_intra_mb: int i8x8, i4x4, chroma_idx; int dquant; GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr; - const uint8_t *scan, *scan8x8, *dc_scan; + const uint8_t *scan, *scan8x8; if(IS_INTERLACED(mb_type)){ scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0; scan= s->qscale ? h->field_scan : h->field_scan_q0; - dc_scan= luma_dc_field_scan; }else{ scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0; scan= s->qscale ? 
h->zigzag_scan : h->zigzag_scan_q0; - dc_scan= luma_dc_zigzag_scan; } dquant= get_se_golomb(&s->gb); @@ -939,7 +937,9 @@ decode_intra_mb: h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale); h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale); if(IS_INTRA16x16(mb_type)){ - if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ + AV_ZERO128(h->mb_luma_dc+0); + AV_ZERO128(h->mb_luma_dc+8); + if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ return -1; //FIXME continue if partitioned and other return -1 too } diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index c01fc77c00..c3ee06d9df 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c) c->h264_idct8_add4 = ff_h264_idct8_add4_c; c->h264_idct_add8 = ff_h264_idct_add8_c; c->h264_idct_add16intra= ff_h264_idct_add16intra_c; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c; c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index 3d32a9c18f..9f16480d56 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -65,11 +65,13 @@ typedef struct H264DSPContext{ void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); + void (*h264_dct)(DCTELEM block[4][4]); void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); void (*h264_idct_add8)(uint8_t **dst/*align 
16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); + void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); }H264DSPContext; void ff_h264dsp_init(H264DSPContext *c); diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c index 86c5ef2559..f5b05ac24f 100644 --- a/libavcodec/h264idct.c +++ b/libavcodec/h264idct.c @@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); } } +/** + * IDCT transforms the 16 dc values and dequantizes them. + * @param qmul dequantization multiplier: each transformed value v is stored to output as (v*qmul + 128) >> 8 + */ +void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){ +#define stride 16 + int i; + int temp[16]; + static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride}; + + for(i=0; i<4; i++){ + const int z0= input[4*i+0] + input[4*i+1]; + const int z1= input[4*i+0] - input[4*i+1]; + const int z2= input[4*i+2] - input[4*i+3]; + const int z3= input[4*i+2] + input[4*i+3]; + + temp[4*i+0]= z0+z3; + temp[4*i+1]= z0-z3; + temp[4*i+2]= z1-z2; + temp[4*i+3]= z1+z2; + } + + for(i=0; i<4; i++){ + const int offset= x_offset[i]; + const int z0= temp[4*0+i] + temp[4*2+i]; + const int z1= temp[4*0+i] - temp[4*2+i]; + const int z2= temp[4*1+i] - temp[4*3+i]; + const int z3= temp[4*1+i] + temp[4*3+i]; + + output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); + output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); + output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); + output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); + } +} diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c index 4a4a1c52cd..6d10fc5ae5 100644 --- a/libavcodec/svq3.c +++ b/libavcodec/svq3.c @@ -126,21 +126,19 @@ static const uint32_t 
svq3_dequant_coeff[32] = { }; -void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp) +void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp) { const int qmul = svq3_dequant_coeff[qp]; #define stride 16 int i; int temp[16]; static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride}; - static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride}; for (i = 0; i < 4; i++){ - const int offset = y_offset[i]; - const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]); - const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]); - const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5]; - const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5]; + const int z0= 13*(input[4*i+0] + input[4*i+1]); + const int z1= 13*(input[4*i+0] - input[4*i+1]); + const int z2= 7* input[4*i+2] - 17*input[4*i+3]; + const int z3= 17* input[4*i+2] + 7*input[4*i+3]; temp[4*i+0] = z0+z3; temp[4*i+1] = z1+z2; @@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp) const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i]; const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i]; - block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20; - block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20; - block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20; - block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20; + output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20; + output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20; + output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20; + output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20; } } #undef stride diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 909ec414e7..375a4c5e09 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = 
{0x8000000080000000ULL, 0x8000000080000000ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 9c154f80b3..fdb35003a8 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 %endif cextern pw_32 +cextern pw_1 SECTION .text @@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8 add8_sse2_cycle 2, 0x21 add8_sse2_cycle 3, 0x29 RET + +;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul) + +%macro WALSH4_1D 5 + SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 + SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 + SWAP %1, %4, %3 +%endmacro + +%macro DEQUANT_MMX 3 + mova m7, [pw_1] + mova m4, %1 + punpcklwd %1, m7 + punpckhwd m4, m7 + mova m5, %2 + punpcklwd %2, m7 + punpckhwd m5, m7 + movd m7, t3d + punpckldq m7, m7 + pmaddwd %1, m7 + pmaddwd %2, m7 + pmaddwd m4, m7 + pmaddwd m5, m7 + psrad %1, %3 + psrad %2, %3 + psrad m4, %3 + psrad m5, %3 + packssdw %1, m4 + packssdw %2, m5 +%endmacro + +%macro STORE_WORDS_MMX 5 + movd t0d, %1 + psrlq %1, 32 + movd t1d, %1 + mov [t2+%2*32], t0w + mov [t2+%4*32], t1w + shr t0d, 16 + shr t1d, 16 + mov [t2+%3*32], t0w + mov [t2+%5*32], t1w +%endmacro + +%macro DEQUANT_STORE_MMX 1 + DEQUANT_MMX m0, m1, %1 + STORE_WORDS_MMX m0, 0, 1, 4, 5 + STORE_WORDS_MMX m1, 2, 3, 6, 7 + + DEQUANT_MMX m2, m3, %1 + STORE_WORDS_MMX m2, 8, 9, 12, 13 + STORE_WORDS_MMX m3, 10, 11, 14, 15 +%endmacro + +%macro STORE_WORDS_SSE 9 + movd t0d, %1 + psrldq %1, 4 + movd t1d, %1 + psrldq %1, 4 + mov [t2+%2*32], t0w + mov [t2+%4*32], t1w + shr t0d, 16 + shr t1d, 16 + mov [t2+%3*32], t0w + mov [t2+%5*32], 
t1w + movd t0d, %1 + psrldq %1, 4 + movd t1d, %1 + mov [t2+%6*32], t0w + mov [t2+%8*32], t1w + shr t0d, 16 + shr t1d, 16 + mov [t2+%7*32], t0w + mov [t2+%9*32], t1w +%endmacro + +%macro DEQUANT_STORE_SSE2 1 + movd xmm4, t3d + movq xmm5, [pw_1] + pshufd xmm4, xmm4, 0 + movq2dq xmm0, m0 + movq2dq xmm1, m1 + movq2dq xmm2, m2 + movq2dq xmm3, m3 + punpcklwd xmm0, xmm5 + punpcklwd xmm1, xmm5 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm5 + pmaddwd xmm0, xmm4 + pmaddwd xmm1, xmm4 + pmaddwd xmm2, xmm4 + pmaddwd xmm3, xmm4 + psrad xmm0, %1 + psrad xmm1, %1 + psrad xmm2, %1 + psrad xmm3, %1 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7 + STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15 +%endmacro + +%macro IDCT_DC_DEQUANT 2 +cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2 + movq m3, [r1+24] + movq m2, [r1+16] + movq m1, [r1+ 8] + movq m0, [r1+ 0] + WALSH4_1D 0,1,2,3,4 + TRANSPOSE4x4W 0,1,2,3,4 + WALSH4_1D 0,1,2,3,4 + +; shift, tmp, output, qmul +%ifdef WIN64 + DECLARE_REG_TMP 0,3,1,2 + ; we can't avoid this, because r0 is the shift register (ecx) on win64 + xchg r0, t2 +%elifdef ARCH_X86_64 + DECLARE_REG_TMP 3,1,0,2 +%else + DECLARE_REG_TMP 1,3,0,2 +%endif + + cmp t3d, 32767 + jg .big_qmul + add t3d, 128 << 16 +%ifidn %1,mmx + DEQUANT_STORE_MMX 8 +%else + DEQUANT_STORE_SSE2 8 +%endif + RET +.big_qmul: + bsr t0d, t3d + add t3d, 128 << 16 + mov t1d, 7 + cmp t0d, t1d + cmovg t0d, t1d + inc t1d + shr t3d, t0b + sub t1d, t0d +%ifidn %1,mmx + movd m6, t1d + DEQUANT_STORE_MMX m6 +%else + movd xmm6, t1d + DEQUANT_STORE_SSE2 xmm6 +%endif + RET +%endmacro + +INIT_MMX +IDCT_DC_DEQUANT mmx, 0 +IDCT_DC_DEQUANT sse2, 7 diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 401a488cb5..d9e45f8b03 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM int stride, const uint8_t nnzc[6*8]); void 
ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); +void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); /***********************************/ /* deblocking */ @@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; c->h264_idct_add8 = ff_h264_idct_add8_mmx; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; if (mm_flags & AV_CPU_FLAG_MMX2) { c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; @@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) if (mm_flags&AV_CPU_FLAG_SSE2) { c->h264_idct8_add = ff_h264_idct8_add_sse2; c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;