From 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 14 Jan 2011 21:34:25 +0000
Subject: [PATCH] H.264: split luma dc idct out and implement MMX/SSE2 versions
 About 2.5x the speed.

NOTE: the way that the asm code handles large qmuls is a bit suboptimal.
If x264-style dequant was used (separate shift and qmul values), it might
be possible to get some extra speed.

Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/dsputil.h         |   4 +
 libavcodec/h264.c            |  50 ++----------
 libavcodec/h264.h            |   5 +-
 libavcodec/h264_cabac.c      |   8 +-
 libavcodec/h264_cavlc.c      |   8 +-
 libavcodec/h264dsp.c         |   1 +
 libavcodec/h264dsp.h         |   2 +
 libavcodec/h264idct.c        |  35 ++++++++
 libavcodec/svq3.c            |  20 ++---
 libavcodec/x86/dsputil_mmx.c |   1 +
 libavcodec/x86/h264_idct.asm | 154 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/h264dsp_mmx.c |   4 +
 12 files changed, 227 insertions(+), 65 deletions(-)

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 6c56a65885..0efbad918a 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
 void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 
+void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
+void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
+void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
+
 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                              const float *win, float add_bias, int len);
 void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index b11d947b77..f3470474ea 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
     return 0;
 }
 
-/**
- * IDCT transforms the 16 dc values and dequantizes them.
- * @param qp quantization parameter
- */
-static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
-#define stride 16
-    int i;
-    int temp[16]; //FIXME check if this is a good idea
-    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
-    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
-
-//memset(block, 64, 2*256);
-//return;
-    for(i=0; i<4; i++){
-        const int offset= y_offset[i];
-        const int z0= block[offset+stride*0] + block[offset+stride*4];
-        const int z1= block[offset+stride*0] - block[offset+stride*4];
-        const int z2= block[offset+stride*1] - block[offset+stride*5];
-        const int z3= block[offset+stride*1] + block[offset+stride*5];
-
-        temp[4*i+0]= z0+z3;
-        temp[4*i+1]= z1+z2;
-        temp[4*i+2]= z1-z2;
-        temp[4*i+3]= z0-z3;
-    }
-
-    for(i=0; i<4; i++){
-        const int offset= x_offset[i];
-        const int z0= temp[4*0+i] + temp[4*2+i];
-        const int z1= temp[4*0+i] - temp[4*2+i];
-        const int z2= temp[4*1+i] - temp[4*3+i];
-        const int z3= temp[4*1+i] + temp[4*3+i];
-
-        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
-        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
-        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
-        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
-    }
-}
-
 #if 0
 /**
  * DCT transforms the 16 dc values.
@@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                 if(is_h264){
                     if(!transform_bypass)
-                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
+                        h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
+                    else{
+                        static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
+                                                                8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
+                        for(i = 0; i < 16; i++)
+                            h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
+                    }
                 }else
-                    ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
+                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
             }
             if(h->deblocking_filter)
                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index 7158d9748e..189864bb38 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -406,6 +406,7 @@ typedef struct H264Context{
     GetBitContext *inter_gb_ptr;
 
     DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
+    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
     DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
 
     /**
@@ -600,10 +601,6 @@ typedef struct H264Context{
 
 extern const uint8_t ff_h264_chroma_qp[52];
 
-void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
-
-void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
-
 /**
  * Decode SEI
  */
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 485837879d..971af37114 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1597,17 +1597,15 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *scan8x8, *dc_scan;
+        const uint8_t *scan, *scan8x8;
         const uint32_t *qmul;
 
         if(IS_INTERLACED(mb_type)){
             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
             scan= s->qscale ? h->field_scan : h->field_scan_q0;
-            dc_scan= luma_dc_field_scan;
         }else{
             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
-            dc_scan= luma_dc_zigzag_scan;
         }
 
         // decode_cabac_mb_dqp
@@ -1642,7 +1640,9 @@ decode_intra_mb:
         if( IS_INTRA16x16( mb_type ) ) {
             int i;
             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
-            decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
+            AV_ZERO128(h->mb_luma_dc+0);
+            AV_ZERO128(h->mb_luma_dc+8);
+            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);
 
             if( cbp&15 ) {
                 qmul = h->dequant4_coeff[0][s->qscale];
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index 6f3bcad782..426a285570 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -911,16 +911,14 @@ decode_intra_mb:
         int i8x8, i4x4, chroma_idx;
         int dquant;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *scan8x8, *dc_scan;
+        const uint8_t *scan, *scan8x8;
 
         if(IS_INTERLACED(mb_type)){
             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
             scan= s->qscale ? h->field_scan : h->field_scan_q0;
-            dc_scan= luma_dc_field_scan;
         }else{
             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
-            dc_scan= luma_dc_zigzag_scan;
         }
 
         dquant= get_se_golomb(&s->gb);
@@ -939,7 +937,9 @@ decode_intra_mb:
         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
         if(IS_INTRA16x16(mb_type)){
-            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
+            AV_ZERO128(h->mb_luma_dc+0);
+            AV_ZERO128(h->mb_luma_dc+8);
+            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                 return -1; //FIXME continue if partitioned and other return -1 too
             }
 
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index c01fc77c00..c3ee06d9df 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c)
     c->h264_idct8_add4     = ff_h264_idct8_add4_c;
     c->h264_idct_add8      = ff_h264_idct_add8_c;
     c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
+    c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;
 
     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 3d32a9c18f..9f16480d56 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -65,11 +65,13 @@ typedef struct H264DSPContext{
     void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
+
     void (*h264_dct)(DCTELEM block[4][4]);
     void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
 }H264DSPContext;
 
 void ff_h264dsp_init(H264DSPContext *c);
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index 86c5ef2559..f5b05ac24f 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
             ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
     }
 }
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qp quantization parameter
+ */
+void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
+#define stride 16
+    int i;
+    int temp[16];
+    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
+
+    for(i=0; i<4; i++){
+        const int z0= input[4*i+0] + input[4*i+1];
+        const int z1= input[4*i+0] - input[4*i+1];
+        const int z2= input[4*i+2] - input[4*i+3];
+        const int z3= input[4*i+2] + input[4*i+3];
+
+        temp[4*i+0]= z0+z3;
+        temp[4*i+1]= z0-z3;
+        temp[4*i+2]= z1-z2;
+        temp[4*i+3]= z1+z2;
+    }
+
+    for(i=0; i<4; i++){
+        const int offset= x_offset[i];
+        const int z0= temp[4*0+i] + temp[4*2+i];
+        const int z1= temp[4*0+i] - temp[4*2+i];
+        const int z2= temp[4*1+i] - temp[4*3+i];
+        const int z3= temp[4*1+i] + temp[4*3+i];
+
+        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
+        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
+        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
+        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
+    }
+}
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 4a4a1c52cd..6d10fc5ae5 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = {
 };
 
 
-void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
+void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
 {
     const int qmul = svq3_dequant_coeff[qp];
 #define stride 16
     int i;
     int temp[16];
     static const int x_offset[4] = {0, 1*stride, 4* stride,  5*stride};
-    static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
 
     for (i = 0; i < 4; i++){
-        const int offset = y_offset[i];
-        const int z0 = 13*(block[offset+stride*0] +    block[offset+stride*4]);
-        const int z1 = 13*(block[offset+stride*0] -    block[offset+stride*4]);
-        const int z2 =  7* block[offset+stride*1] - 17*block[offset+stride*5];
-        const int z3 = 17* block[offset+stride*1] +  7*block[offset+stride*5];
+        const int z0= 13*(input[4*i+0] +    input[4*i+1]);
+        const int z1= 13*(input[4*i+0] -    input[4*i+1]);
+        const int z2=  7* input[4*i+2] - 17*input[4*i+3];
+        const int z3= 17* input[4*i+2] +  7*input[4*i+3];
 
         temp[4*i+0] = z0+z3;
         temp[4*i+1] = z1+z2;
@@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
         const int z2 =  7* temp[4*1+i] - 17*temp[4*3+i];
         const int z3 = 17* temp[4*1+i] +  7*temp[4*3+i];
 
-        block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
-        block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
-        block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
-        block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
+        output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
+        output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
+        output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
+        output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
     }
 }
 #undef stride
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 909ec414e7..375a4c5e09 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -41,6 +41,7 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
 {0x8000000080000000ULL, 0x8000000080000000ULL};
 
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1  ) = 0x0001000100010001ULL;
 DECLARE_ALIGNED(8,  const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 9c154f80b3..fdb35003a8 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
 %endif
 
 cextern pw_32
+cextern pw_1
 
 SECTION .text
 
@@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
     add8_sse2_cycle 2, 0x21
     add8_sse2_cycle 3, 0x29
     RET
+
+;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
+
+%macro WALSH4_1D 5
+    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
+    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
+    SWAP %1, %4, %3
+%endmacro
+
+%macro DEQUANT_MMX 3
+    mova        m7, [pw_1]
+    mova        m4, %1
+    punpcklwd   %1, m7
+    punpckhwd   m4, m7
+    mova        m5, %2
+    punpcklwd   %2, m7
+    punpckhwd   m5, m7
+    movd        m7, t3d
+    punpckldq   m7, m7
+    pmaddwd     %1, m7
+    pmaddwd     %2, m7
+    pmaddwd     m4, m7
+    pmaddwd     m5, m7
+    psrad       %1, %3
+    psrad       %2, %3
+    psrad       m4, %3
+    psrad       m5, %3
+    packssdw    %1, m4
+    packssdw    %2, m5
+%endmacro
+
+%macro STORE_WORDS_MMX 5
+    movd  t0d, %1
+    psrlq  %1, 32
+    movd  t1d, %1
+    mov [t2+%2*32], t0w
+    mov [t2+%4*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%3*32], t0w
+    mov [t2+%5*32], t1w
+%endmacro
+
+%macro DEQUANT_STORE_MMX 1
+    DEQUANT_MMX m0, m1, %1
+    STORE_WORDS_MMX m0,  0,  1,  4,  5
+    STORE_WORDS_MMX m1,  2,  3,  6,  7
+
+    DEQUANT_MMX m2, m3, %1
+    STORE_WORDS_MMX m2,  8,  9, 12, 13
+    STORE_WORDS_MMX m3, 10, 11, 14, 15
+%endmacro
+
+%macro STORE_WORDS_SSE 9
+    movd  t0d, %1
+    psrldq  %1, 4
+    movd  t1d, %1
+    psrldq  %1, 4
+    mov [t2+%2*32], t0w
+    mov [t2+%4*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%3*32], t0w
+    mov [t2+%5*32], t1w
+    movd  t0d, %1
+    psrldq  %1, 4
+    movd  t1d, %1
+    mov [t2+%6*32], t0w
+    mov [t2+%8*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%7*32], t0w
+    mov [t2+%9*32], t1w
+%endmacro
+
+%macro DEQUANT_STORE_SSE2 1
+    movd      xmm4, t3d
+    movq      xmm5, [pw_1]
+    pshufd    xmm4, xmm4, 0
+    movq2dq   xmm0, m0
+    movq2dq   xmm1, m1
+    movq2dq   xmm2, m2
+    movq2dq   xmm3, m3
+    punpcklwd xmm0, xmm5
+    punpcklwd xmm1, xmm5
+    punpcklwd xmm2, xmm5
+    punpcklwd xmm3, xmm5
+    pmaddwd   xmm0, xmm4
+    pmaddwd   xmm1, xmm4
+    pmaddwd   xmm2, xmm4
+    pmaddwd   xmm3, xmm4
+    psrad     xmm0, %1
+    psrad     xmm1, %1
+    psrad     xmm2, %1
+    psrad     xmm3, %1
+    packssdw  xmm0, xmm1
+    packssdw  xmm2, xmm3
+    STORE_WORDS_SSE xmm0,  0,  1,  4,  5,  2,  3,  6,  7
+    STORE_WORDS_SSE xmm2,  8,  9, 12, 13, 10, 11, 14, 15
+%endmacro
+
+%macro IDCT_DC_DEQUANT 2
+cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
+    movq        m3, [r1+24]
+    movq        m2, [r1+16]
+    movq        m1, [r1+ 8]
+    movq        m0, [r1+ 0]
+    WALSH4_1D    0,1,2,3,4
+    TRANSPOSE4x4W 0,1,2,3,4
+    WALSH4_1D    0,1,2,3,4
+
+; shift, tmp, output, qmul
+%ifdef WIN64
+    DECLARE_REG_TMP 0,3,1,2
+    ; we can't avoid this, because r0 is the shift register (ecx) on win64
+    xchg        r0, t2
+%elifdef ARCH_X86_64
+    DECLARE_REG_TMP 3,1,0,2
+%else
+    DECLARE_REG_TMP 1,3,0,2
+%endif
+
+    cmp        t3d, 32767
+    jg .big_qmul
+    add        t3d, 128 << 16
+%ifidn %1,mmx
+    DEQUANT_STORE_MMX 8
+%else
+    DEQUANT_STORE_SSE2 8
+%endif
+    RET
+.big_qmul:
+    bsr        t0d, t3d
+    add        t3d, 128 << 16
+    mov        t1d, 7
+    cmp        t0d, t1d
+    cmovg      t0d, t1d
+    inc        t1d
+    shr        t3d, t0b
+    sub        t1d, t0d
+%ifidn %1,mmx
+    movd        m6, t1d
+    DEQUANT_STORE_MMX m6
+%else
+    movd      xmm6, t1d
+    DEQUANT_STORE_SSE2 xmm6
+%endif
+    RET
+%endmacro
+
+INIT_MMX
+IDCT_DC_DEQUANT mmx, 0
+IDCT_DC_DEQUANT sse2, 7
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 401a488cb5..d9e45f8b03 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
                                   int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
                                   int stride, const uint8_t nnzc[6*8]);
+void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
 
 /***********************************/
 /* deblocking */
@@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
         c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
         c->h264_idct_add8      = ff_h264_idct_add8_mmx;
         c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+        c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
@@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
             if (mm_flags&AV_CPU_FLAG_SSE2) {
                 c->h264_idct8_add = ff_h264_idct8_add_sse2;
                 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
 
                 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;