more speed

Originally committed as revision 2429 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
This commit is contained in:
Michael Niedermayer 2001-10-23 15:55:54 +00:00
parent ac0b0b2f6d
commit 9736722ad7
2 changed files with 116 additions and 124 deletions

View File

@ -60,6 +60,7 @@ compare the quality & speed of all filters
split this huge file split this huge file
fix warnings (unused vars, ...) fix warnings (unused vars, ...)
noise reduction filters noise reduction filters
write an exact implementation of the horizontal delocking filter
... ...
Notes: Notes:
@ -1450,7 +1451,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
{ {
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
"pushl %0 \n\t" "leal (%0, %1), %%ecx \n\t"
"leal (%%ecx, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"movq bm00001000, %%mm6 \n\t" "movq bm00001000, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" // QP "movd %2, %%mm5 \n\t" // QP
@ -1464,10 +1468,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
//FIXME? "unroll by 2" and mix //FIXME? "unroll by 2" and mix
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
#define HDF(i) \ #define HDF(src, dst) \
"movq " #i "(%%eax), %%mm0 \n\t"\ "movq " #src "(%%eax), %%mm0 \n\t"\
"movq %%mm0, %%mm1 \n\t"\ "movq " #src "(%%eax), %%mm1 \n\t"\
"movq %%mm0, %%mm2 \n\t"\ "movq " #src "(%%eax), %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\ "psrlq $8, %%mm1 \n\t"\
"psubusb %%mm1, %%mm2 \n\t"\ "psubusb %%mm1, %%mm2 \n\t"\
"psubusb %%mm0, %%mm1 \n\t"\ "psubusb %%mm0, %%mm1 \n\t"\
@ -1486,12 +1490,12 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\ "psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\ "psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\ "paddb %%mm1, %%mm0 \n\t"\
"movd %%mm0, (%0) \n\t"\ "movd %%mm0, " #dst" \n\t"\
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4" #dst" \n\t"
#else #else
#define HDF(i)\ #define HDF(src, dst)\
"movq " #i "(%%eax), %%mm0 \n\t"\ "movq " #src "(%%eax), %%mm0 \n\t"\
"movq %%mm0, %%mm1 \n\t"\ "movq %%mm0, %%mm1 \n\t"\
"movq %%mm0, %%mm2 \n\t"\ "movq %%mm0, %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\ "psrlq $8, %%mm1 \n\t"\
@ -1515,29 +1519,21 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\ "psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\ "psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\ "paddb %%mm1, %%mm0 \n\t"\
"movd %%mm0, (%0) \n\t"\ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4" #dst " \n\t"
#endif #endif
HDF(0) HDF(0,(%0))
"addl %1, %0 \n\t" HDF(8,(%%ecx))
HDF(8) HDF(16,(%%ecx, %1))
"addl %1, %0 \n\t" HDF(24,(%%ecx, %1, 2))
HDF(16) HDF(32,(%0, %1, 4))
"addl %1, %0 \n\t" HDF(40,(%%ebx))
HDF(24) HDF(48,(%%ebx, %1))
"addl %1, %0 \n\t" HDF(56,(%%ebx, %1, 2))
HDF(32)
"addl %1, %0 \n\t"
HDF(40)
"addl %1, %0 \n\t"
HDF(48)
"addl %1, %0 \n\t"
HDF(56)
"popl %0 \n\t"
: :
: "r" (dst), "r" (stride), "r" (QP) : "r" (dst), "r" (stride), "r" (QP)
: "%eax" : "%eax", "%ebx", "%ecx"
); );
#else #else
uint8_t *src= tempBlock; uint8_t *src= tempBlock;
@ -1597,8 +1593,11 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
{ {
//return; //return;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile( //"movv %0 %1 %2\n\t" asm volatile(
"pushl %0\n\t" "leal (%0, %1), %%ecx \n\t"
"leal (%%ecx, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"leal tempBlock, %%eax \n\t" "leal tempBlock, %%eax \n\t"
/* /*
@ -1714,20 +1713,20 @@ Implemented Exact 7-Tap
#endif #endif
/* uses the 7-Tap Filter: 1112111 */ /* uses the 7-Tap Filter: 1112111 */
#define NEW_HLP(i)\ #define NEW_HLP(src, dst)\
"movq " #i "(%%eax), %%mm0 \n\t"\ "movq " #src "(%%eax), %%mm1 \n\t"\
"movq %%mm0, %%mm1 \n\t"\ "movq " #src "(%%eax), %%mm2 \n\t"\
"movq %%mm0, %%mm2 \n\t"\
"movd -4(%0), %%mm3 \n\t" /*0001000*/\
"movd 8(%0), %%mm4 \n\t" /*0001000*/\
"psllq $8, %%mm1 \n\t"\ "psllq $8, %%mm1 \n\t"\
"psrlq $8, %%mm2 \n\t"\ "psrlq $8, %%mm2 \n\t"\
"movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
"movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
"psrlq $24, %%mm3 \n\t"\ "psrlq $24, %%mm3 \n\t"\
"psllq $56, %%mm4 \n\t"\ "psllq $56, %%mm4 \n\t"\
"por %%mm3, %%mm1 \n\t"\ "por %%mm3, %%mm1 \n\t"\
"por %%mm4, %%mm2 \n\t"\ "por %%mm4, %%mm2 \n\t"\
"movq %%mm1, %%mm5 \n\t"\ "movq %%mm1, %%mm5 \n\t"\
PAVGB(%%mm2, %%mm1)\ PAVGB(%%mm2, %%mm1)\
"movq " #src "(%%eax), %%mm0 \n\t"\
PAVGB(%%mm1, %%mm0)\ PAVGB(%%mm1, %%mm0)\
"psllq $8, %%mm5 \n\t"\ "psllq $8, %%mm5 \n\t"\
"psrlq $8, %%mm2 \n\t"\ "psrlq $8, %%mm2 \n\t"\
@ -1742,9 +1741,9 @@ Implemented Exact 7-Tap
PAVGB(%%mm2, %%mm1)\ PAVGB(%%mm2, %%mm1)\
PAVGB(%%mm1, %%mm5)\ PAVGB(%%mm1, %%mm5)\
PAVGB(%%mm5, %%mm0)\ PAVGB(%%mm5, %%mm0)\
"movd %%mm0, (%0) \n\t"\ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4" #dst " \n\t"
/* uses the 9-Tap Filter: 112242211 */ /* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\ #define NEW_HLP2(i)\
@ -1786,28 +1785,20 @@ Implemented Exact 7-Tap
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4(%0) \n\t"
#define HLP(i) NEW_HLP(i) #define HLP(src, dst) NEW_HLP(src, dst)
HLP(0) HLP(0, (%0))
"addl %1, %0 \n\t" HLP(8, (%%ecx))
HLP(8) HLP(16, (%%ecx, %1))
"addl %1, %0 \n\t" HLP(24, (%%ecx, %1, 2))
HLP(16) HLP(32, (%0, %1, 4))
"addl %1, %0 \n\t" HLP(40, (%%ebx))
HLP(24) HLP(48, (%%ebx, %1))
"addl %1, %0 \n\t" HLP(56, (%%ebx, %1, 2))
HLP(32)
"addl %1, %0 \n\t"
HLP(40)
"addl %1, %0 \n\t"
HLP(48)
"addl %1, %0 \n\t"
HLP(56)
"popl %0\n\t"
: :
: "r" (dst), "r" (stride) : "r" (dst), "r" (stride)
: "%eax", "%ebx" : "%eax", "%ebx", "%ecx"
); );
#else #else
@ -2743,10 +2734,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
{ {
const int stride= dstStride; const int stride= dstStride;
int QP= isColor ? int QP;
QPs[(y>>3)*QPStride + (x>>3)]: if(isColor)
QPs[(y>>4)*QPStride + (x>>4)]; {
if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; QP=QPs[(y>>3)*QPStride + (x>>3)];
}
else
{
QP= QPs[(y>>4)*QPStride + (x>>4)];
if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
yHistogram[ srcBlock[srcStride*5] ]++;
}
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
"movd %0, %%mm7 \n\t" "movd %0, %%mm7 \n\t"
@ -2776,8 +2774,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
*/ */
#endif #endif
if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
#ifdef PP_FUNNY_STRIDE #ifdef PP_FUNNY_STRIDE
//can we mess with a 8x16 block, if not use a temp buffer, yes again //can we mess with a 8x16 block, if not use a temp buffer, yes again
if(x+7 >= width) if(x+7 >= width)

View File

@ -60,6 +60,7 @@ compare the quality & speed of all filters
split this huge file split this huge file
fix warnings (unused vars, ...) fix warnings (unused vars, ...)
noise reduction filters noise reduction filters
write an exact implementation of the horizontal delocking filter
... ...
Notes: Notes:
@ -1450,7 +1451,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
{ {
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
"pushl %0 \n\t" "leal (%0, %1), %%ecx \n\t"
"leal (%%ecx, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"movq bm00001000, %%mm6 \n\t" "movq bm00001000, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" // QP "movd %2, %%mm5 \n\t" // QP
@ -1464,10 +1468,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
//FIXME? "unroll by 2" and mix //FIXME? "unroll by 2" and mix
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
#define HDF(i) \ #define HDF(src, dst) \
"movq " #i "(%%eax), %%mm0 \n\t"\ "movq " #src "(%%eax), %%mm0 \n\t"\
"movq %%mm0, %%mm1 \n\t"\ "movq " #src "(%%eax), %%mm1 \n\t"\
"movq %%mm0, %%mm2 \n\t"\ "movq " #src "(%%eax), %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\ "psrlq $8, %%mm1 \n\t"\
"psubusb %%mm1, %%mm2 \n\t"\ "psubusb %%mm1, %%mm2 \n\t"\
"psubusb %%mm0, %%mm1 \n\t"\ "psubusb %%mm0, %%mm1 \n\t"\
@ -1486,12 +1490,12 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\ "psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\ "psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\ "paddb %%mm1, %%mm0 \n\t"\
"movd %%mm0, (%0) \n\t"\ "movd %%mm0, " #dst" \n\t"\
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4" #dst" \n\t"
#else #else
#define HDF(i)\ #define HDF(src, dst)\
"movq " #i "(%%eax), %%mm0 \n\t"\ "movq " #src "(%%eax), %%mm0 \n\t"\
"movq %%mm0, %%mm1 \n\t"\ "movq %%mm0, %%mm1 \n\t"\
"movq %%mm0, %%mm2 \n\t"\ "movq %%mm0, %%mm2 \n\t"\
"psrlq $8, %%mm1 \n\t"\ "psrlq $8, %%mm1 \n\t"\
@ -1515,29 +1519,21 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
"psubb %%mm1, %%mm0 \n\t"\ "psubb %%mm1, %%mm0 \n\t"\
"psllq $8, %%mm1 \n\t"\ "psllq $8, %%mm1 \n\t"\
"paddb %%mm1, %%mm0 \n\t"\ "paddb %%mm1, %%mm0 \n\t"\
"movd %%mm0, (%0) \n\t"\ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4" #dst " \n\t"
#endif #endif
HDF(0) HDF(0,(%0))
"addl %1, %0 \n\t" HDF(8,(%%ecx))
HDF(8) HDF(16,(%%ecx, %1))
"addl %1, %0 \n\t" HDF(24,(%%ecx, %1, 2))
HDF(16) HDF(32,(%0, %1, 4))
"addl %1, %0 \n\t" HDF(40,(%%ebx))
HDF(24) HDF(48,(%%ebx, %1))
"addl %1, %0 \n\t" HDF(56,(%%ebx, %1, 2))
HDF(32)
"addl %1, %0 \n\t"
HDF(40)
"addl %1, %0 \n\t"
HDF(48)
"addl %1, %0 \n\t"
HDF(56)
"popl %0 \n\t"
: :
: "r" (dst), "r" (stride), "r" (QP) : "r" (dst), "r" (stride), "r" (QP)
: "%eax" : "%eax", "%ebx", "%ecx"
); );
#else #else
uint8_t *src= tempBlock; uint8_t *src= tempBlock;
@ -1597,8 +1593,11 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
{ {
//return; //return;
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile( //"movv %0 %1 %2\n\t" asm volatile(
"pushl %0\n\t" "leal (%0, %1), %%ecx \n\t"
"leal (%%ecx, %1, 4), %%ebx \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"leal tempBlock, %%eax \n\t" "leal tempBlock, %%eax \n\t"
/* /*
@ -1714,20 +1713,20 @@ Implemented Exact 7-Tap
#endif #endif
/* uses the 7-Tap Filter: 1112111 */ /* uses the 7-Tap Filter: 1112111 */
#define NEW_HLP(i)\ #define NEW_HLP(src, dst)\
"movq " #i "(%%eax), %%mm0 \n\t"\ "movq " #src "(%%eax), %%mm1 \n\t"\
"movq %%mm0, %%mm1 \n\t"\ "movq " #src "(%%eax), %%mm2 \n\t"\
"movq %%mm0, %%mm2 \n\t"\
"movd -4(%0), %%mm3 \n\t" /*0001000*/\
"movd 8(%0), %%mm4 \n\t" /*0001000*/\
"psllq $8, %%mm1 \n\t"\ "psllq $8, %%mm1 \n\t"\
"psrlq $8, %%mm2 \n\t"\ "psrlq $8, %%mm2 \n\t"\
"movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
"movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
"psrlq $24, %%mm3 \n\t"\ "psrlq $24, %%mm3 \n\t"\
"psllq $56, %%mm4 \n\t"\ "psllq $56, %%mm4 \n\t"\
"por %%mm3, %%mm1 \n\t"\ "por %%mm3, %%mm1 \n\t"\
"por %%mm4, %%mm2 \n\t"\ "por %%mm4, %%mm2 \n\t"\
"movq %%mm1, %%mm5 \n\t"\ "movq %%mm1, %%mm5 \n\t"\
PAVGB(%%mm2, %%mm1)\ PAVGB(%%mm2, %%mm1)\
"movq " #src "(%%eax), %%mm0 \n\t"\
PAVGB(%%mm1, %%mm0)\ PAVGB(%%mm1, %%mm0)\
"psllq $8, %%mm5 \n\t"\ "psllq $8, %%mm5 \n\t"\
"psrlq $8, %%mm2 \n\t"\ "psrlq $8, %%mm2 \n\t"\
@ -1742,9 +1741,9 @@ Implemented Exact 7-Tap
PAVGB(%%mm2, %%mm1)\ PAVGB(%%mm2, %%mm1)\
PAVGB(%%mm1, %%mm5)\ PAVGB(%%mm1, %%mm5)\
PAVGB(%%mm5, %%mm0)\ PAVGB(%%mm5, %%mm0)\
"movd %%mm0, (%0) \n\t"\ "movd %%mm0, " #dst " \n\t"\
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4" #dst " \n\t"
/* uses the 9-Tap Filter: 112242211 */ /* uses the 9-Tap Filter: 112242211 */
#define NEW_HLP2(i)\ #define NEW_HLP2(i)\
@ -1786,28 +1785,20 @@ Implemented Exact 7-Tap
"psrlq $32, %%mm0 \n\t"\ "psrlq $32, %%mm0 \n\t"\
"movd %%mm0, 4(%0) \n\t" "movd %%mm0, 4(%0) \n\t"
#define HLP(i) NEW_HLP(i) #define HLP(src, dst) NEW_HLP(src, dst)
HLP(0) HLP(0, (%0))
"addl %1, %0 \n\t" HLP(8, (%%ecx))
HLP(8) HLP(16, (%%ecx, %1))
"addl %1, %0 \n\t" HLP(24, (%%ecx, %1, 2))
HLP(16) HLP(32, (%0, %1, 4))
"addl %1, %0 \n\t" HLP(40, (%%ebx))
HLP(24) HLP(48, (%%ebx, %1))
"addl %1, %0 \n\t" HLP(56, (%%ebx, %1, 2))
HLP(32)
"addl %1, %0 \n\t"
HLP(40)
"addl %1, %0 \n\t"
HLP(48)
"addl %1, %0 \n\t"
HLP(56)
"popl %0\n\t"
: :
: "r" (dst), "r" (stride) : "r" (dst), "r" (stride)
: "%eax", "%ebx" : "%eax", "%ebx", "%ecx"
); );
#else #else
@ -2743,10 +2734,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
{ {
const int stride= dstStride; const int stride= dstStride;
int QP= isColor ? int QP;
QPs[(y>>3)*QPStride + (x>>3)]: if(isColor)
QPs[(y>>4)*QPStride + (x>>4)]; {
if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; QP=QPs[(y>>3)*QPStride + (x>>3)];
}
else
{
QP= QPs[(y>>4)*QPStride + (x>>4)];
if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
yHistogram[ srcBlock[srcStride*5] ]++;
}
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
"movd %0, %%mm7 \n\t" "movd %0, %%mm7 \n\t"
@ -2776,8 +2774,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
*/ */
#endif #endif
if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
#ifdef PP_FUNNY_STRIDE #ifdef PP_FUNNY_STRIDE
//can we mess with a 8x16 block, if not use a temp buffer, yes again //can we mess with a 8x16 block, if not use a temp buffer, yes again
if(x+7 >= width) if(x+7 >= width)