
deinterlace: purge MMX/MMXEXT

notes:
 - this removes all SIMD acceleration for x86/x86_64. this work originally
   started by converting the MMX code to SSE2 and then purging the
   remaining artifacts, but a build error on Android has blocked that work
   from being merged for now. this commit therefore takes a different
   approach: simply purge the old MMX/MMXEXT code first, and get the SSE2
   implementation working in a follow-up.
 - the `EndMMX()` function is retained (renamed to `EndSSE()`) because it
   is still used by the merge code. the `emms` instruction it executes will
   be replaced with an `sfence` instruction separately, as that is more
   appropriate.
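
   as a rough sketch of that follow-up (not part of this commit; the exact
   form is an assumption), the retained finalization routine would issue
   `sfence` rather than `emms` once the SSE2 merge relies on non-temporal
   stores:

   void EndSSE( void )
   {
       /* sfence orders any non-temporal stores issued by the SIMD merge
        * routines; unlike emms, there is no x87/MMX state left to clear. */
       __asm__ __volatile__( "sfence" ::: "memory" );
   }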
This commit is contained in:
Lyndon Brown 2019-01-26 08:51:36 +00:00 committed by François Cartegnie
parent e6bb48cc15
commit 831fa3cd0f
14 changed files with 15 additions and 931 deletions

View File

@ -159,7 +159,6 @@ noinst_LTLIBRARIES += libdeinterlace_common.la
libdeinterlace_plugin_la_SOURCES = \
video_filter/deinterlace/deinterlace.c video_filter/deinterlace/deinterlace.h \
video_filter/deinterlace/mmx.h \
video_filter/deinterlace/merge.c video_filter/deinterlace/merge.h \
video_filter/deinterlace/helpers.c video_filter/deinterlace/helpers.h \
video_filter/deinterlace/algo_basic.c video_filter/deinterlace/algo_basic.h \

View File

@ -24,10 +24,6 @@
# include "config.h"
#endif
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
#endif
#include <stdint.h>
#include <assert.h>

View File

@ -24,11 +24,6 @@
# include "config.h"
#endif
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
# include <stdalign.h>
#endif
#include <stdint.h>
#include <assert.h>
@ -87,7 +82,7 @@ static void DarkenField( picture_t *p_dst,
For luma, the operation is just a shift + bitwise AND, so we vectorize
even in the C version.
There is an MMX version too, because it performs about twice faster.
There are SIMD versions too, which perform significantly faster.
*/
int i_plane = Y_PLANE;
uint8_t *p_out, *p_out_end;
@ -120,7 +115,7 @@ static void DarkenField( picture_t *p_dst,
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
The chroma processing is a bit more complicated than luma,
and needs MMX for vectorization.
and needs SIMD for vectorization.
*/
if( process_chroma )
{
@ -148,129 +143,6 @@ static void DarkenField( picture_t *p_dst,
} /* if process_chroma */
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static void DarkenFieldMMX( picture_t *p_dst,
const int i_field, const int i_strength,
bool process_chroma )
{
assert( p_dst != NULL );
assert( i_field == 0 || i_field == 1 );
assert( i_strength >= 1 && i_strength <= 3 );
uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
const uint8_t remove_high_u8 = 0xFF >> i_strength;
const uint64_t remove_high_u64 = remove_high_u8 *
INT64_C(0x0101010101010101);
int i_plane = Y_PLANE;
uint8_t *p_out, *p_out_end;
int w = p_dst->p[i_plane].i_visible_pitch;
p_out = p_dst->p[i_plane].p_pixels;
p_out_end = p_out + p_dst->p[i_plane].i_pitch
* p_dst->p[i_plane].i_visible_lines;
/* skip first line for bottom field */
if( i_field == 1 )
p_out += p_dst->p[i_plane].i_pitch;
int wm8 = w % 8; /* remainder */
int w8 = w - wm8; /* part of width that is divisible by 8 */
for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
{
uint64_t *po = (uint64_t *)p_out;
int x = 0;
movq_m2r( i_strength_u64, mm1 );
movq_m2r( remove_high_u64, mm2 );
for( ; x < w8; x += 8 )
{
movq_m2r( (*po), mm0 );
psrlq_r2r( mm1, mm0 );
pand_r2r( mm2, mm0 );
movq_r2m( mm0, (*po++) );
}
/* handle the width remainder */
uint8_t *po_temp = (uint8_t *)po;
for( ; x < w; ++x, ++po_temp )
(*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
}
/* Process chroma if the field chromas are independent.
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
The chroma processing is a bit more complicated than luma,
and needs MMX for vectorization.
*/
if( process_chroma )
{
for( i_plane++ /* luma already handled */;
i_plane < p_dst->i_planes;
i_plane++ )
{
w = p_dst->p[i_plane].i_visible_pitch;
wm8 = w % 8; /* remainder */
w8 = w - wm8; /* part of width that is divisible by 8 */
p_out = p_dst->p[i_plane].p_pixels;
p_out_end = p_out + p_dst->p[i_plane].i_pitch
* p_dst->p[i_plane].i_visible_lines;
/* skip first line for bottom field */
if( i_field == 1 )
p_out += p_dst->p[i_plane].i_pitch;
for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
{
int x = 0;
/* See also easy-to-read C version below. */
static alignas (8) const mmx_t b128 = {
.uq = 0x8080808080808080ULL
};
movq_m2r( b128, mm5 );
movq_m2r( i_strength_u64, mm6 );
movq_m2r( remove_high_u64, mm7 );
uint64_t *po8 = (uint64_t *)p_out;
for( ; x < w8; x += 8 )
{
movq_m2r( (*po8), mm0 );
movq_r2r( mm5, mm2 ); /* 128 */
movq_r2r( mm0, mm1 ); /* copy of data */
psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
/* >> i_strength */
psrlq_r2r( mm6, mm1 );
psrlq_r2r( mm6, mm2 );
pand_r2r( mm7, mm1 );
pand_r2r( mm7, mm2 );
/* collect results from pos./neg. parts */
psubb_r2r( mm2, mm1 );
paddb_r2r( mm5, mm1 );
movq_r2m( mm1, (*po8++) );
}
/* C version - handle the width remainder */
uint8_t *po = p_out;
for( ; x < w; ++x, ++po )
(*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
} /* for p_out... */
} /* for i_plane... */
} /* if process_chroma */
emms();
}
#endif
/*****************************************************************************
* Public functions
*****************************************************************************/
@ -357,13 +229,6 @@ int RenderPhosphor( filter_t *p_filter,
*/
if( p_sys->phosphor.i_dimmer_strength > 0 )
{
#ifdef CAN_COMPILE_MMXEXT
if( vlc_CPU_MMXEXT() )
DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
else
#endif
DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );

View File

@ -24,10 +24,6 @@
# include "config.h"
#endif
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
#endif
#include <stdint.h>
#include <vlc_common.h>
@ -76,71 +72,6 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
return fc < 1 ? false : true;
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
{
int y, x;
int32_t ff, fr;
int fc;
/* Detect interlacing */
fc = 0;
pxor_r2r( mm7, mm7 );
for( y = 0; y < 9; y += 2 )
{
ff = fr = 0;
pxor_r2r( mm5, mm5 );
pxor_r2r( mm6, mm6 );
for( x = 0; x < 8; x+=4 )
{
movd_m2r( src[ x], mm0 );
movd_m2r( src[1*i_src+x], mm1 );
movd_m2r( src[2*i_src+x], mm2 );
movd_m2r( src[3*i_src+x], mm3 );
punpcklbw_r2r( mm7, mm0 );
punpcklbw_r2r( mm7, mm1 );
punpcklbw_r2r( mm7, mm2 );
punpcklbw_r2r( mm7, mm3 );
movq_r2r( mm0, mm4 );
psubw_r2r( mm1, mm0 );
psubw_r2r( mm2, mm4 );
psubw_r2r( mm1, mm2 );
psubw_r2r( mm1, mm3 );
pmaddwd_r2r( mm0, mm0 );
pmaddwd_r2r( mm4, mm4 );
pmaddwd_r2r( mm2, mm2 );
pmaddwd_r2r( mm3, mm3 );
paddd_r2r( mm0, mm2 );
paddd_r2r( mm4, mm3 );
paddd_r2r( mm2, mm5 );
paddd_r2r( mm3, mm6 );
}
movq_r2r( mm5, mm0 );
psrlq_i2r( 32, mm0 );
paddd_r2r( mm0, mm5 );
movd_r2m( mm5, fr );
movq_r2r( mm6, mm0 );
psrlq_i2r( 32, mm0 );
paddd_r2r( mm0, mm6 );
movd_r2m( mm6, ff );
if( ff < 6*fr/8 && fr > 32 )
fc++;
src += 2*i_src;
}
return fc;
}
#endif
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
@ -163,49 +94,6 @@ static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
}
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
uint8_t *src2, int i_src2 )
{
static const uint64_t m_4 = INT64_C(0x0004000400040004);
int y, x;
/* Progressive */
pxor_r2r( mm7, mm7 );
for( y = 0; y < 8; y += 2 )
{
for( x = 0; x < 8; x +=4 )
{
movd_m2r( src1[x], mm0 );
movd_r2m( mm0, dst[x] );
movd_m2r( src2[x], mm1 );
movd_m2r( src1[i_src1+x], mm2 );
punpcklbw_r2r( mm7, mm0 );
punpcklbw_r2r( mm7, mm1 );
punpcklbw_r2r( mm7, mm2 );
paddw_r2r( mm1, mm1 );
movq_r2r( mm1, mm3 );
paddw_r2r( mm3, mm3 );
paddw_r2r( mm2, mm0 );
paddw_r2r( mm3, mm1 );
paddw_m2r( m_4, mm1 );
paddw_r2r( mm1, mm0 );
psraw_i2r( 3, mm0 );
packuswb_r2r( mm7, mm0 );
movd_r2m( mm0, dst[i_dst+x] );
}
dst += 2*i_dst;
src1 += i_src1;
src2 += i_src2;
}
}
#endif
/* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for block that miss a
* neighbour
* (Use 8x9 pixels)
@ -229,31 +117,6 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
}
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
uint8_t *src, int i_src )
{
int y;
/* Interlaced */
for( y = 0; y < 8; y += 2 )
{
movq_m2r( src[0], mm0 );
movq_r2m( mm0, dst[0] );
dst += i_dst;
movq_m2r( src[2*i_src], mm1 );
pavgb_r2r( mm1, mm0 );
movq_r2m( mm0, dst[0] );
dst += 1*i_dst;
src += 2*i_src;
}
}
#endif
/* XDeint8x8Field: Edge oriented interpolation
* (Need -4 and +5 pixels H, +1 line)
*/
@ -271,7 +134,7 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
for( x = 0; x < 8; x++ )
{
uint8_t *src2 = &src[2*i_src];
/* I use 8 pixels just to match the MMX version, but it's overkill
/* I use 8 pixels just to match the SIMD version, but it's overkill
* 5 would be enough (less isn't good) */
const int c0 = abs(src[x-4]-src2[x-2]) + abs(src[x-3]-src2[x-1]) +
abs(src[x-2]-src2[x+0]) + abs(src[x-1]-src2[x+1]) +
@ -301,50 +164,6 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
}
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
uint8_t *src, int i_src )
{
int y, x;
/* Interlaced */
for( y = 0; y < 8; y += 2 )
{
memcpy( dst, src, 8 );
dst += i_dst;
for( x = 0; x < 8; x++ )
{
uint8_t *src2 = &src[2*i_src];
int32_t c0, c1, c2;
movq_m2r( src[x-2], mm0 );
movq_m2r( src[x-3], mm1 );
movq_m2r( src[x-4], mm2 );
psadbw_m2r( src2[x-4], mm0 );
psadbw_m2r( src2[x-3], mm1 );
psadbw_m2r( src2[x-2], mm2 );
movd_r2m( mm0, c2 );
movd_r2m( mm1, c1 );
movd_r2m( mm2, c0 );
if( c0 < c1 && c1 <= c2 )
dst[x] = (src[x-1] + src2[x+1]) >> 1;
else if( c2 < c1 && c1 <= c0 )
dst[x] = (src[x+1] + src2[x-1]) >> 1;
else
dst[x] = (src[x+0] + src2[x+0]) >> 1;
}
dst += 1*i_dst;
src += 2*i_src;
}
}
#endif
/* NxN arbitray size (and then only use pixel in the NxN block)
*/
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
@ -472,41 +291,6 @@ static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
uint8_t *src, int i_src,
const int i_mbx, int i_modx )
{
int x;
/* Reset current line */
for( x = 0; x < i_mbx; x++ )
{
int s;
if( ( s = XDeint8x8DetectMMXEXT( src, i_src ) ) )
{
if( x == 0 || x == i_mbx - 1 )
XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
else
XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
}
else
{
XDeint8x8MergeMMXEXT( dst, i_dst,
&src[0*i_src], 2*i_src,
&src[1*i_src], 2*i_src );
}
dst += 8;
src += 8;
}
if( i_modx )
XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
#endif
/*****************************************************************************
* Public functions
*****************************************************************************/
@ -515,9 +299,6 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
{
VLC_UNUSED(p_filter);
int i_plane;
#if defined (CAN_COMPILE_MMXEXT)
const bool mmxext = vlc_CPU_MMXEXT();
#endif
/* Copy image and skip lines */
for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
@ -538,12 +319,7 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
#ifdef CAN_COMPILE_MMXEXT
if( mmxext )
XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
else
#endif
XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
}
/* Last line (C only)*/
@ -565,9 +341,5 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
}
}
#ifdef CAN_COMPILE_MMXEXT
if( mmxext )
emms();
#endif
return VLC_SUCCESS;
}

View File

@ -33,13 +33,13 @@ struct picture_t;
/**
* Interpolating deinterlace filter "X".
*
* The algorithm works on a 8x8 block basic, it copies the top field
* The algorithm works on a 8x8 block basis; It copies the top field
* and applies a process to recreate the bottom field.
*
* If a 8x8 block is classified as :
* - progressive: it applies a small blend (1,6,1)
* - interlaced:
* * in the MMX version: we do a ME between the 2 fields, if there is a
* * in the SIMD version: we do a ME between the 2 fields, if there is a
* good match we use MC to recreate the bottom field (with a small
* blend (1,6,1) )
* * otherwise: it recreates the bottom field by an edge oriented
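
For reference (not part of this diff), the (1,6,1) blend mentioned above amounts to the following per-pixel arithmetic; this is an illustrative C sketch mirroring the removed MMXEXT merge shown earlier, with variable names borrowed from it:

/* copy the line that exists, then rebuild the missing line as a
 * (1 + 6 + 1) / 8 weighted average of the lines above, between and below */
dst[x]         = src1[x];
dst[i_dst + x] = ( src1[x] + 6 * src2[x] + src1[i_src1 + x] + 4 ) >> 3;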

View File

@ -119,11 +119,6 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
if( vlc_CPU_SSE2() )
filter = vlcpriv_yadif_filter_line_sse2;
else
#if defined(__i386__)
if( vlc_CPU_MMXEXT() )
filter = vlcpriv_yadif_filter_line_mmxext;
else
#endif
#endif
filter = yadif_filter_line_c;

View File

@ -558,15 +558,7 @@ notsupp:
if( vlc_CPU_SSE2() )
{
p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
p_sys->pf_end_merge = EndMMX;
}
else
#endif
#if defined(CAN_COMPILE_MMXEXT)
if( pixel_size == 1 && vlc_CPU_MMXEXT() )
{
p_sys->pf_merge = MergeMMXEXT;
p_sys->pf_end_merge = EndMMX;
p_sys->pf_end_merge = EndSSE;
}
else
#endif

View File

@ -68,7 +68,7 @@ typedef struct
{
const vlc_chroma_description_t *chroma;
/** Merge routine: C, MMX, SSE, ALTIVEC, NEON, ... */
/** Merge routine: C, SSE, ALTIVEC, NEON, ... */
void (*pf_merge) ( void *, const void *, const void *, size_t );
#if defined (__i386__) || defined (__x86_64__)
/** Merge finalization routine for SSE */

View File

@ -24,11 +24,6 @@
# include "config.h"
#endif
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
# include <stdalign.h>
#endif
#include <stdint.h>
#include <assert.h>
@ -107,9 +102,6 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
* For interpretation of pi_top and pi_bot, it is assumed that the block
* starts on an even-numbered line (belonging to the top field).
*
* The b_mmx parameter avoids the need to call vlc_CPU() separately
* for each block.
*
* @param[in] p_pix_p Base pointer to the block in previous picture
* @param[in] p_pix_c Base pointer to the same block in current picture
* @param i_pitch_prev i_pitch of previous picture
@ -172,79 +164,6 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
changes "enough". */
return (i_motion >= 8);
}
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
int i_pitch_prev, int i_pitch_curr,
int* pi_top, int* pi_bot )
{
int32_t i_motion = 0;
int32_t i_top_motion = 0;
int32_t i_bot_motion = 0;
static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
movq_m2r( bT, mm5 );
pxor_r2r( mm3, mm3 ); /* score (top field) */
pxor_r2r( mm4, mm4 ); /* score (bottom field) */
for( int y = 0; y < 8; y+=2 )
{
/* top field */
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
movq_r2r( mm0, mm2 );
psubusb_r2r( mm1, mm2 );
psubusb_r2r( mm0, mm1 );
pcmpgtb_r2r( mm5, mm2 );
pcmpgtb_r2r( mm5, mm1 );
psadbw_r2r( mm6, mm2 );
psadbw_r2r( mm6, mm1 );
paddd_r2r( mm2, mm1 );
paddd_r2r( mm1, mm3 ); /* add to top field score */
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
/* bottom field - handling identical to top field, except... */
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
movq_r2r( mm0, mm2 );
psubusb_r2r( mm1, mm2 );
psubusb_r2r( mm0, mm1 );
pcmpgtb_r2r( mm5, mm2 );
pcmpgtb_r2r( mm5, mm1 );
psadbw_r2r( mm6, mm2 );
psadbw_r2r( mm6, mm1 );
paddd_r2r( mm2, mm1 );
paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
movq_r2r( mm3, mm7 ); /* score (total) */
paddd_r2r( mm4, mm7 );
movd_r2m( mm3, i_top_motion );
movd_r2m( mm4, i_bot_motion );
movd_r2m( mm7, i_motion );
/* The loop counts actual score * 255. */
i_top_motion /= 255;
i_bot_motion /= 255;
i_motion /= 255;
emms();
(*pi_top) = ( i_top_motion >= 8 );
(*pi_bot) = ( i_bot_motion >= 8 );
return (i_motion >= 8);
}
#endif
#undef T
/*****************************************************************************
@ -396,11 +315,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
TestForMotionInBlock;
/* We must tell our inline helper whether to use MMX acceleration. */
#ifdef CAN_COMPILE_MMXEXT
if (vlc_CPU_MMXEXT())
motion_in_block = TestForMotionInBlockMMX;
#endif
int i_score = 0;
for( int i_plane = 0 ; i_plane < p_prev->i_planes ; i_plane++ )
@ -451,142 +365,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
/* Threshold (value from Transcode 1.1.5) */
#define T 100
#ifdef CAN_COMPILE_MMXEXT
VLC_MMX
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
{
assert( p_pic_top->i_planes == p_pic_bot->i_planes );
/* Amount of bits must be known for MMX, thus int32_t.
Doesn't hurt the C implementation. */
int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
{
/* Sanity check */
if( p_pic_top->p[i_plane].i_visible_lines !=
p_pic_bot->p[i_plane].i_visible_lines )
return -1;
const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
p_pic_bot->p[i_plane].i_visible_pitch );
const int wm8 = w % 8; /* remainder */
const int w8 = w - wm8; /* part of width that is divisible by 8 */
/* Current line / neighbouring lines picture pointers */
const picture_t *cur = p_pic_bot;
const picture_t *ngh = p_pic_top;
int wc = cur->p[i_plane].i_pitch;
int wn = ngh->p[i_plane].i_pitch;
/* Transcode 1.1.5 only checks every other line. Checking every line
works better for anime, which may contain horizontal,
one pixel thick cartoon outlines.
*/
for( int y = 1; y < i_lasty; ++y )
{
uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */
uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
int x = 0;
/* Easy-to-read C version further below.
Assumptions: 0 < T < 127
# of pixels < (2^32)/255
Note: calculates score * 255
*/
static alignas (8) const mmx_t b0 = {
.uq = 0x0000000000000000ULL };
static alignas (8) const mmx_t b128 = {
.uq = 0x8080808080808080ULL };
static alignas (8) const mmx_t bT = {
.ub = { T, T, T, T, T, T, T, T } };
for( ; x < w8; x += 8 )
{
movq_m2r( *((int64_t*)p_c), mm0 );
movq_m2r( *((int64_t*)p_p), mm1 );
movq_m2r( *((int64_t*)p_n), mm2 );
psubb_m2r( b128, mm0 );
psubb_m2r( b128, mm1 );
psubb_m2r( b128, mm2 );
psubsb_r2r( mm0, mm1 );
psubsb_r2r( mm0, mm2 );
pxor_r2r( mm3, mm3 );
pxor_r2r( mm4, mm4 );
pxor_r2r( mm5, mm5 );
pxor_r2r( mm6, mm6 );
punpcklbw_r2r( mm1, mm3 );
punpcklbw_r2r( mm2, mm4 );
punpckhbw_r2r( mm1, mm5 );
punpckhbw_r2r( mm2, mm6 );
pmulhw_r2r( mm3, mm4 );
pmulhw_r2r( mm5, mm6 );
packsswb_r2r(mm4, mm6);
pcmpgtb_m2r( bT, mm6 );
psadbw_m2r( b0, mm6 );
paddd_r2r( mm6, mm7 );
p_c += 8;
p_p += 8;
p_n += 8;
}
for( ; x < w; ++x )
{
/* Worst case: need 17 bits for "comb". */
int_fast32_t C = *p_c;
int_fast32_t P = *p_p;
int_fast32_t N = *p_n;
/* Comments in Transcode's filter_ivtc.c attribute this
combing metric to Gunnar Thalin.
The idea is that if the picture is interlaced, both
expressions will have the same sign, and this comes
up positive. The value T = 100 has been chosen such
that a pixel difference of 10 (on average) will
trigger the detector.
*/
int_fast32_t comb = (P - C) * (N - C);
if( comb > T )
++i_score_c;
++p_c;
++p_p;
++p_n;
}
/* Now the other field - swap current and neighbour pictures */
const picture_t *tmp = cur;
cur = ngh;
ngh = tmp;
int tmp_pitch = wc;
wc = wn;
wn = tmp_pitch;
}
}
movd_r2m( mm7, i_score_mmx );
emms();
return i_score_mmx/255 + i_score_c;
}
#endif
/* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
@ -607,11 +385,6 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
#ifdef CAN_COMPILE_MMXEXT
if (vlc_CPU_MMXEXT())
return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
#endif
int32_t i_score = 0;
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )

View File

@ -33,10 +33,6 @@
#include <vlc_cpu.h>
#include "merge.h"
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
#endif
#ifdef HAVE_ALTIVEC_H
# include <altivec.h>
#endif
@ -67,32 +63,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#if defined(CAN_COMPILE_MMXEXT)
VLC_MMX
void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
size_t i_bytes )
{
uint8_t *p_dest = _p_dest;
const uint8_t *p_s1 = _p_s1;
const uint8_t *p_s2 = _p_s2;
for( ; i_bytes >= 8; i_bytes -= 8 )
{
__asm__ __volatile__( "movq %2,%%mm1;"
"pavgb %1, %%mm1;"
"movq %%mm1, %0" :"=m" (*p_dest):
"m" (*p_s1),
"m" (*p_s2) : "mm1" );
p_dest += 8;
p_s1 += 8;
p_s2 += 8;
}
for( ; i_bytes > 0; i_bytes-- )
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#endif
#if defined(CAN_COMPILE_SSE)
VLC_SSE
void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
@ -223,8 +193,8 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
* EndMerge routines
*****************************************************************************/
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
void EndMMX( void )
#if defined(CAN_COMPILE_SSE2)
void EndSSE( void )
{
__asm__ __volatile__( "emms" :: );
}

View File

@ -112,18 +112,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_MMXEXT)
/**
* MMXEXT routine to blend pixels from two picture lines.
*
* @param _p_dest Target
* @param _p_s1 Source line A
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
void MergeMMXEXT ( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_SSE)
/**
* SSE2 routine to blend pixels from two picture lines.
@ -175,17 +163,17 @@ void merge16_arm_sve(void *, const void *, const void *, size_t);
* EndMerge routines
*****************************************************************************/
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
#if defined(CAN_COMPILE_SSE2)
/**
* MMX merge finalization routine.
* SSE merge finalization routine.
*
* Must be called after an MMX merge is finished.
* This exits MMX mode (by executing the "emms" instruction).
* Should be called after an SSE merge is finished.
* This exits SSE mode (by executing the "emms" instruction).
*
* The EndMerge() macro detects whether this is needed, and calls if it is,
* so just use that.
*/
void EndMMX ( void );
void EndSSE( void );
#endif
#endif
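
For illustration (call sites are not shown in this diff), the merge function pointers set up by the plugin are used roughly as follows; the control flow is an assumption, only the field names come from the diff:

p_sys->pf_merge( p_dest, p_s1, p_s2, i_bytes ); /* blend one pair of lines */
/* ... repeat for every line of the field ... */
if( p_sys->pf_end_merge != NULL )               /* finalization, as EndMerge() arranges */
    p_sys->pf_end_merge();                      /* EndSSE() on x86, when compiled in */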

View File

@ -1,256 +0,0 @@
/*
* mmx.h
* Copyright (C) 1997-1999 H. Dietz and R. Fisher
*
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
*
* mpeg2dec is free software; you can redistribute it and/or modify
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* The type of an value that fits in an MMX register (note that long
* long constant values MUST be suffixed by LL and unsigned long long
* values by ULL, lest they be truncated by the compiler)
*/
#include <stdint.h>
typedef union {
int64_t q; /* Quadword (64-bit) value */
uint64_t uq; /* Unsigned Quadword */
int32_t d[2]; /* 2 Doubleword (32-bit) values */
uint32_t ud[2]; /* 2 Unsigned Doubleword */
int16_t w[4]; /* 4 Word (16-bit) values */
uint16_t uw[4]; /* 4 Unsigned Word */
int8_t b[8]; /* 8 Byte (8-bit) values */
uint8_t ub[8]; /* 8 Unsigned Byte */
float s[2]; /* Single-precision (32-bit) value */
} mmx_t; /* NOTE: must be on an 8-byte (64-bit) boundary */
#define mmx_i2r(op,imm,reg) \
__asm__ __volatile__ (#op " %0, %%" #reg \
: /* nothing */ \
: "i" (imm) \
: #reg)
#define mmx_m2r(op,mem,reg) \
__asm__ __volatile__ (#op " %0, %%" #reg \
: /* nothing */ \
: "m" (mem) \
: #reg)
#define mmx_r2m(op,reg,mem) \
__asm__ __volatile__ (#op " %%" #reg ", %0" \
: "=m" (mem) \
: /* nothing */ \
: "memory")
#define mmx_r2r(op,regs,regd) \
__asm__ __volatile__ (#op " %%" #regs ", %%" #regd ::: #regd)
#define emms() __asm__ __volatile__ ("emms")
#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
#define por_m2r(var,reg) mmx_m2r (por, var, reg)
#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
/* AMD MMX extensions - also available in intel SSE */
#define mmx_m2ri(op,mem,reg,imm) \
__asm__ __volatile__ (#op " %1, %0, %%" #reg \
: /* nothing */ \
: "X" (mem), "X" (imm) \
: #reg)
#define mmx_r2ri(op,regs,regd,imm) \
__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
: /* nothing */ \
: "X" (imm) \
: #regd)
#define mmx_fetch(mem,hint) \
__asm__ __volatile__ ("prefetch" #hint " %0" \
: /* nothing */ \
: "X" (mem))
#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
#define pmovmskb(mmreg,reg) \
__asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg : : : #reg)
#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
#define prefetcht0(mem) mmx_fetch (mem, t0)
#define prefetcht1(mem) mmx_fetch (mem, t1)
#define prefetcht2(mem) mmx_fetch (mem, t2)
#define prefetchnta(mem) mmx_fetch (mem, nta)
#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
#define sfence() __asm__ __volatile__ ("sfence\n\t")

View File

@ -97,6 +97,3 @@ static void yadif_filter_line_c_16bit(uint8_t *dst8, uint8_t *prev8, uint8_t *cu
void vlcpriv_yadif_filter_line_ssse3(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
void vlcpriv_yadif_filter_line_sse2(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
#endif
#if defined(__i386__)
void vlcpriv_yadif_filter_line_mmxext(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
#endif

View File

@ -248,9 +248,6 @@ cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
FILTER 0, curq, nextq
.ret:
%if mmsize == 8
emms
%endif
RET
%if ARCH_X86_32
%undef pb_1
@ -262,7 +259,3 @@ INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif