mirror of
https://code.videolan.org/videolan/vlc
synced 2024-10-03 01:31:53 +02:00
deinterlace: purge MMX/MMXEXT
Notes: - This removes all SIMD acceleration for x86/x86_64. Originally this work started by converting the MMX code to SSE2 and then purging the remaining artifacts, but a build error on Android has blocked that work from being merged for now. This commit thus takes a different approach: it simply purges the old MMX/MMXEXT code first, and getting the SSE2 implementation working is left as a follow-up. - The `EndMMX()` function is retained (renamed to `EndSSE()`) because it is still used by the merge code. The `emms` instruction will be replaced with an `sfence` instruction separately, as that is more appropriate.
This commit is contained in:
parent
e6bb48cc15
commit
831fa3cd0f
@ -159,7 +159,6 @@ noinst_LTLIBRARIES += libdeinterlace_common.la
|
||||
|
||||
libdeinterlace_plugin_la_SOURCES = \
|
||||
video_filter/deinterlace/deinterlace.c video_filter/deinterlace/deinterlace.h \
|
||||
video_filter/deinterlace/mmx.h \
|
||||
video_filter/deinterlace/merge.c video_filter/deinterlace/merge.h \
|
||||
video_filter/deinterlace/helpers.c video_filter/deinterlace/helpers.h \
|
||||
video_filter/deinterlace/algo_basic.c video_filter/deinterlace/algo_basic.h \
|
||||
|
@ -24,10 +24,6 @@
|
||||
# include "config.h"
|
||||
#endif
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
# include "mmx.h"
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
@ -24,11 +24,6 @@
|
||||
# include "config.h"
|
||||
#endif
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
# include "mmx.h"
|
||||
# include <stdalign.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
@ -87,7 +82,7 @@ static void DarkenField( picture_t *p_dst,
|
||||
For luma, the operation is just a shift + bitwise AND, so we vectorize
|
||||
even in the C version.
|
||||
|
||||
There is an MMX version too, because it performs about twice faster.
|
||||
There are SIMD versions too, which perform significantly faster.
|
||||
*/
|
||||
int i_plane = Y_PLANE;
|
||||
uint8_t *p_out, *p_out_end;
|
||||
@ -120,7 +115,7 @@ static void DarkenField( picture_t *p_dst,
|
||||
|
||||
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
|
||||
The chroma processing is a bit more complicated than luma,
|
||||
and needs MMX for vectorization.
|
||||
and needs SIMD for vectorization.
|
||||
*/
|
||||
if( process_chroma )
|
||||
{
|
||||
@ -148,129 +143,6 @@ static void DarkenField( picture_t *p_dst,
|
||||
} /* if process_chroma */
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static void DarkenFieldMMX( picture_t *p_dst,
|
||||
const int i_field, const int i_strength,
|
||||
bool process_chroma )
|
||||
{
|
||||
assert( p_dst != NULL );
|
||||
assert( i_field == 0 || i_field == 1 );
|
||||
assert( i_strength >= 1 && i_strength <= 3 );
|
||||
|
||||
uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
|
||||
const uint8_t remove_high_u8 = 0xFF >> i_strength;
|
||||
const uint64_t remove_high_u64 = remove_high_u8 *
|
||||
INT64_C(0x0101010101010101);
|
||||
|
||||
int i_plane = Y_PLANE;
|
||||
uint8_t *p_out, *p_out_end;
|
||||
int w = p_dst->p[i_plane].i_visible_pitch;
|
||||
p_out = p_dst->p[i_plane].p_pixels;
|
||||
p_out_end = p_out + p_dst->p[i_plane].i_pitch
|
||||
* p_dst->p[i_plane].i_visible_lines;
|
||||
|
||||
/* skip first line for bottom field */
|
||||
if( i_field == 1 )
|
||||
p_out += p_dst->p[i_plane].i_pitch;
|
||||
|
||||
int wm8 = w % 8; /* remainder */
|
||||
int w8 = w - wm8; /* part of width that is divisible by 8 */
|
||||
for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
|
||||
{
|
||||
uint64_t *po = (uint64_t *)p_out;
|
||||
int x = 0;
|
||||
|
||||
movq_m2r( i_strength_u64, mm1 );
|
||||
movq_m2r( remove_high_u64, mm2 );
|
||||
for( ; x < w8; x += 8 )
|
||||
{
|
||||
movq_m2r( (*po), mm0 );
|
||||
|
||||
psrlq_r2r( mm1, mm0 );
|
||||
pand_r2r( mm2, mm0 );
|
||||
|
||||
movq_r2m( mm0, (*po++) );
|
||||
}
|
||||
|
||||
/* handle the width remainder */
|
||||
uint8_t *po_temp = (uint8_t *)po;
|
||||
for( ; x < w; ++x, ++po_temp )
|
||||
(*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
|
||||
}
|
||||
|
||||
/* Process chroma if the field chromas are independent.
|
||||
|
||||
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
|
||||
The chroma processing is a bit more complicated than luma,
|
||||
and needs MMX for vectorization.
|
||||
*/
|
||||
if( process_chroma )
|
||||
{
|
||||
for( i_plane++ /* luma already handled */;
|
||||
i_plane < p_dst->i_planes;
|
||||
i_plane++ )
|
||||
{
|
||||
w = p_dst->p[i_plane].i_visible_pitch;
|
||||
wm8 = w % 8; /* remainder */
|
||||
w8 = w - wm8; /* part of width that is divisible by 8 */
|
||||
|
||||
p_out = p_dst->p[i_plane].p_pixels;
|
||||
p_out_end = p_out + p_dst->p[i_plane].i_pitch
|
||||
* p_dst->p[i_plane].i_visible_lines;
|
||||
|
||||
/* skip first line for bottom field */
|
||||
if( i_field == 1 )
|
||||
p_out += p_dst->p[i_plane].i_pitch;
|
||||
|
||||
for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
/* See also easy-to-read C version below. */
|
||||
static alignas (8) const mmx_t b128 = {
|
||||
.uq = 0x8080808080808080ULL
|
||||
};
|
||||
|
||||
movq_m2r( b128, mm5 );
|
||||
movq_m2r( i_strength_u64, mm6 );
|
||||
movq_m2r( remove_high_u64, mm7 );
|
||||
|
||||
uint64_t *po8 = (uint64_t *)p_out;
|
||||
for( ; x < w8; x += 8 )
|
||||
{
|
||||
movq_m2r( (*po8), mm0 );
|
||||
|
||||
movq_r2r( mm5, mm2 ); /* 128 */
|
||||
movq_r2r( mm0, mm1 ); /* copy of data */
|
||||
psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
|
||||
psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
|
||||
|
||||
/* >> i_strength */
|
||||
psrlq_r2r( mm6, mm1 );
|
||||
psrlq_r2r( mm6, mm2 );
|
||||
pand_r2r( mm7, mm1 );
|
||||
pand_r2r( mm7, mm2 );
|
||||
|
||||
/* collect results from pos./neg. parts */
|
||||
psubb_r2r( mm2, mm1 );
|
||||
paddb_r2r( mm5, mm1 );
|
||||
|
||||
movq_r2m( mm1, (*po8++) );
|
||||
}
|
||||
|
||||
/* C version - handle the width remainder */
|
||||
uint8_t *po = p_out;
|
||||
for( ; x < w; ++x, ++po )
|
||||
(*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
|
||||
} /* for p_out... */
|
||||
} /* for i_plane... */
|
||||
} /* if process_chroma */
|
||||
|
||||
emms();
|
||||
}
|
||||
#endif
|
||||
|
||||
/*****************************************************************************
|
||||
* Public functions
|
||||
*****************************************************************************/
|
||||
@ -357,13 +229,6 @@ int RenderPhosphor( filter_t *p_filter,
|
||||
*/
|
||||
if( p_sys->phosphor.i_dimmer_strength > 0 )
|
||||
{
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
if( vlc_CPU_MMXEXT() )
|
||||
DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
|
||||
p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
|
||||
p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
|
||||
else
|
||||
#endif
|
||||
DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
|
||||
p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
|
||||
p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
|
||||
|
@ -24,10 +24,6 @@
|
||||
# include "config.h"
|
||||
#endif
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
# include "mmx.h"
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <vlc_common.h>
|
||||
@ -76,71 +72,6 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
|
||||
|
||||
return fc < 1 ? false : true;
|
||||
}
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
|
||||
{
|
||||
|
||||
int y, x;
|
||||
int32_t ff, fr;
|
||||
int fc;
|
||||
|
||||
/* Detect interlacing */
|
||||
fc = 0;
|
||||
pxor_r2r( mm7, mm7 );
|
||||
for( y = 0; y < 9; y += 2 )
|
||||
{
|
||||
ff = fr = 0;
|
||||
pxor_r2r( mm5, mm5 );
|
||||
pxor_r2r( mm6, mm6 );
|
||||
for( x = 0; x < 8; x+=4 )
|
||||
{
|
||||
movd_m2r( src[ x], mm0 );
|
||||
movd_m2r( src[1*i_src+x], mm1 );
|
||||
movd_m2r( src[2*i_src+x], mm2 );
|
||||
movd_m2r( src[3*i_src+x], mm3 );
|
||||
|
||||
punpcklbw_r2r( mm7, mm0 );
|
||||
punpcklbw_r2r( mm7, mm1 );
|
||||
punpcklbw_r2r( mm7, mm2 );
|
||||
punpcklbw_r2r( mm7, mm3 );
|
||||
|
||||
movq_r2r( mm0, mm4 );
|
||||
|
||||
psubw_r2r( mm1, mm0 );
|
||||
psubw_r2r( mm2, mm4 );
|
||||
|
||||
psubw_r2r( mm1, mm2 );
|
||||
psubw_r2r( mm1, mm3 );
|
||||
|
||||
pmaddwd_r2r( mm0, mm0 );
|
||||
pmaddwd_r2r( mm4, mm4 );
|
||||
pmaddwd_r2r( mm2, mm2 );
|
||||
pmaddwd_r2r( mm3, mm3 );
|
||||
paddd_r2r( mm0, mm2 );
|
||||
paddd_r2r( mm4, mm3 );
|
||||
paddd_r2r( mm2, mm5 );
|
||||
paddd_r2r( mm3, mm6 );
|
||||
}
|
||||
|
||||
movq_r2r( mm5, mm0 );
|
||||
psrlq_i2r( 32, mm0 );
|
||||
paddd_r2r( mm0, mm5 );
|
||||
movd_r2m( mm5, fr );
|
||||
|
||||
movq_r2r( mm6, mm0 );
|
||||
psrlq_i2r( 32, mm0 );
|
||||
paddd_r2r( mm0, mm6 );
|
||||
movd_r2m( mm6, ff );
|
||||
|
||||
if( ff < 6*fr/8 && fr > 32 )
|
||||
fc++;
|
||||
|
||||
src += 2*i_src;
|
||||
}
|
||||
return fc;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
|
||||
uint8_t *src1, int i_src1,
|
||||
@ -163,49 +94,6 @@ static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
|
||||
uint8_t *src1, int i_src1,
|
||||
uint8_t *src2, int i_src2 )
|
||||
{
|
||||
static const uint64_t m_4 = INT64_C(0x0004000400040004);
|
||||
int y, x;
|
||||
|
||||
/* Progressive */
|
||||
pxor_r2r( mm7, mm7 );
|
||||
for( y = 0; y < 8; y += 2 )
|
||||
{
|
||||
for( x = 0; x < 8; x +=4 )
|
||||
{
|
||||
movd_m2r( src1[x], mm0 );
|
||||
movd_r2m( mm0, dst[x] );
|
||||
|
||||
movd_m2r( src2[x], mm1 );
|
||||
movd_m2r( src1[i_src1+x], mm2 );
|
||||
|
||||
punpcklbw_r2r( mm7, mm0 );
|
||||
punpcklbw_r2r( mm7, mm1 );
|
||||
punpcklbw_r2r( mm7, mm2 );
|
||||
paddw_r2r( mm1, mm1 );
|
||||
movq_r2r( mm1, mm3 );
|
||||
paddw_r2r( mm3, mm3 );
|
||||
paddw_r2r( mm2, mm0 );
|
||||
paddw_r2r( mm3, mm1 );
|
||||
paddw_m2r( m_4, mm1 );
|
||||
paddw_r2r( mm1, mm0 );
|
||||
psraw_i2r( 3, mm0 );
|
||||
packuswb_r2r( mm7, mm0 );
|
||||
movd_r2m( mm0, dst[i_dst+x] );
|
||||
}
|
||||
dst += 2*i_dst;
|
||||
src1 += i_src1;
|
||||
src2 += i_src2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for blocks that miss a
|
||||
* neighbour
|
||||
* (Use 8x9 pixels)
|
||||
@ -229,31 +117,6 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
|
||||
uint8_t *src, int i_src )
|
||||
{
|
||||
int y;
|
||||
|
||||
/* Interlaced */
|
||||
for( y = 0; y < 8; y += 2 )
|
||||
{
|
||||
movq_m2r( src[0], mm0 );
|
||||
movq_r2m( mm0, dst[0] );
|
||||
dst += i_dst;
|
||||
|
||||
movq_m2r( src[2*i_src], mm1 );
|
||||
pavgb_r2r( mm1, mm0 );
|
||||
|
||||
movq_r2m( mm0, dst[0] );
|
||||
|
||||
dst += 1*i_dst;
|
||||
src += 2*i_src;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* XDeint8x8Field: Edge oriented interpolation
|
||||
* (Need -4 and +5 pixels H, +1 line)
|
||||
*/
|
||||
@ -271,7 +134,7 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
|
||||
for( x = 0; x < 8; x++ )
|
||||
{
|
||||
uint8_t *src2 = &src[2*i_src];
|
||||
/* I use 8 pixels just to match the MMX version, but it's overkill
|
||||
/* I use 8 pixels just to match the SIMD version, but it's overkill
|
||||
* 5 would be enough (less isn't good) */
|
||||
const int c0 = abs(src[x-4]-src2[x-2]) + abs(src[x-3]-src2[x-1]) +
|
||||
abs(src[x-2]-src2[x+0]) + abs(src[x-1]-src2[x+1]) +
|
||||
@ -301,50 +164,6 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
|
||||
uint8_t *src, int i_src )
|
||||
{
|
||||
int y, x;
|
||||
|
||||
/* Interlaced */
|
||||
for( y = 0; y < 8; y += 2 )
|
||||
{
|
||||
memcpy( dst, src, 8 );
|
||||
dst += i_dst;
|
||||
|
||||
for( x = 0; x < 8; x++ )
|
||||
{
|
||||
uint8_t *src2 = &src[2*i_src];
|
||||
int32_t c0, c1, c2;
|
||||
|
||||
movq_m2r( src[x-2], mm0 );
|
||||
movq_m2r( src[x-3], mm1 );
|
||||
movq_m2r( src[x-4], mm2 );
|
||||
|
||||
psadbw_m2r( src2[x-4], mm0 );
|
||||
psadbw_m2r( src2[x-3], mm1 );
|
||||
psadbw_m2r( src2[x-2], mm2 );
|
||||
|
||||
movd_r2m( mm0, c2 );
|
||||
movd_r2m( mm1, c1 );
|
||||
movd_r2m( mm2, c0 );
|
||||
|
||||
if( c0 < c1 && c1 <= c2 )
|
||||
dst[x] = (src[x-1] + src2[x+1]) >> 1;
|
||||
else if( c2 < c1 && c1 <= c0 )
|
||||
dst[x] = (src[x+1] + src2[x-1]) >> 1;
|
||||
else
|
||||
dst[x] = (src[x+0] + src2[x+0]) >> 1;
|
||||
}
|
||||
|
||||
dst += 1*i_dst;
|
||||
src += 2*i_src;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* NxN arbitrary size (and then only use pixels in the NxN block)
|
||||
*/
|
||||
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
|
||||
@ -472,41 +291,6 @@ static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
|
||||
XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
|
||||
uint8_t *src, int i_src,
|
||||
const int i_mbx, int i_modx )
|
||||
{
|
||||
int x;
|
||||
|
||||
/* Reset current line */
|
||||
for( x = 0; x < i_mbx; x++ )
|
||||
{
|
||||
int s;
|
||||
if( ( s = XDeint8x8DetectMMXEXT( src, i_src ) ) )
|
||||
{
|
||||
if( x == 0 || x == i_mbx - 1 )
|
||||
XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
|
||||
else
|
||||
XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
|
||||
}
|
||||
else
|
||||
{
|
||||
XDeint8x8MergeMMXEXT( dst, i_dst,
|
||||
&src[0*i_src], 2*i_src,
|
||||
&src[1*i_src], 2*i_src );
|
||||
}
|
||||
|
||||
dst += 8;
|
||||
src += 8;
|
||||
}
|
||||
|
||||
if( i_modx )
|
||||
XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
|
||||
}
|
||||
#endif
|
||||
|
||||
/*****************************************************************************
|
||||
* Public functions
|
||||
*****************************************************************************/
|
||||
@ -515,9 +299,6 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
|
||||
{
|
||||
VLC_UNUSED(p_filter);
|
||||
int i_plane;
|
||||
#if defined (CAN_COMPILE_MMXEXT)
|
||||
const bool mmxext = vlc_CPU_MMXEXT();
|
||||
#endif
|
||||
|
||||
/* Copy image and skip lines */
|
||||
for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
|
||||
@ -538,12 +319,7 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
|
||||
uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
|
||||
uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
if( mmxext )
|
||||
XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
|
||||
else
|
||||
#endif
|
||||
XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
|
||||
XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
|
||||
}
|
||||
|
||||
/* Last line (C only)*/
|
||||
@ -565,9 +341,5 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
if( mmxext )
|
||||
emms();
|
||||
#endif
|
||||
return VLC_SUCCESS;
|
||||
}
|
||||
|
@ -33,13 +33,13 @@ struct picture_t;
|
||||
/**
|
||||
* Interpolating deinterlace filter "X".
|
||||
*
|
||||
* The algorithm works on a 8x8 block basic, it copies the top field
|
||||
* The algorithm works on an 8x8 block basis; it copies the top field
|
||||
* and applies a process to recreate the bottom field.
|
||||
*
|
||||
* If a 8x8 block is classified as :
|
||||
* - progressive: it applies a small blend (1,6,1)
|
||||
* - interlaced:
|
||||
* * in the MMX version: we do a ME between the 2 fields, if there is a
|
||||
* * in the SIMD version: we do a ME between the 2 fields, if there is a
|
||||
* good match we use MC to recreate the bottom field (with a small
|
||||
* blend (1,6,1) )
|
||||
* * otherwise: it recreates the bottom field by an edge oriented
|
||||
|
@ -119,11 +119,6 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
|
||||
if( vlc_CPU_SSE2() )
|
||||
filter = vlcpriv_yadif_filter_line_sse2;
|
||||
else
|
||||
#if defined(__i386__)
|
||||
if( vlc_CPU_MMXEXT() )
|
||||
filter = vlcpriv_yadif_filter_line_mmxext;
|
||||
else
|
||||
#endif
|
||||
#endif
|
||||
filter = yadif_filter_line_c;
|
||||
|
||||
|
@ -558,15 +558,7 @@ notsupp:
|
||||
if( vlc_CPU_SSE2() )
|
||||
{
|
||||
p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
|
||||
p_sys->pf_end_merge = EndMMX;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if defined(CAN_COMPILE_MMXEXT)
|
||||
if( pixel_size == 1 && vlc_CPU_MMXEXT() )
|
||||
{
|
||||
p_sys->pf_merge = MergeMMXEXT;
|
||||
p_sys->pf_end_merge = EndMMX;
|
||||
p_sys->pf_end_merge = EndSSE;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
@ -68,7 +68,7 @@ typedef struct
|
||||
{
|
||||
const vlc_chroma_description_t *chroma;
|
||||
|
||||
/** Merge routine: C, MMX, SSE, ALTIVEC, NEON, ... */
|
||||
/** Merge routine: C, SSE, ALTIVEC, NEON, ... */
|
||||
void (*pf_merge) ( void *, const void *, const void *, size_t );
|
||||
#if defined (__i386__) || defined (__x86_64__)
|
||||
/** Merge finalization routine for SSE */
|
||||
|
@ -24,11 +24,6 @@
|
||||
# include "config.h"
|
||||
#endif
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
# include "mmx.h"
|
||||
# include <stdalign.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
@ -107,9 +102,6 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
|
||||
* For interpretation of pi_top and pi_bot, it is assumed that the block
|
||||
* starts on an even-numbered line (belonging to the top field).
|
||||
*
|
||||
* The b_mmx parameter avoids the need to call vlc_CPU() separately
|
||||
* for each block.
|
||||
*
|
||||
* @param[in] p_pix_p Base pointer to the block in previous picture
|
||||
* @param[in] p_pix_c Base pointer to the same block in current picture
|
||||
* @param i_pitch_prev i_pitch of previous picture
|
||||
@ -172,79 +164,6 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
|
||||
changes "enough". */
|
||||
return (i_motion >= 8);
|
||||
}
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
|
||||
int i_pitch_prev, int i_pitch_curr,
|
||||
int* pi_top, int* pi_bot )
|
||||
{
|
||||
int32_t i_motion = 0;
|
||||
int32_t i_top_motion = 0;
|
||||
int32_t i_bot_motion = 0;
|
||||
|
||||
static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
|
||||
pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
|
||||
movq_m2r( bT, mm5 );
|
||||
|
||||
pxor_r2r( mm3, mm3 ); /* score (top field) */
|
||||
pxor_r2r( mm4, mm4 ); /* score (bottom field) */
|
||||
for( int y = 0; y < 8; y+=2 )
|
||||
{
|
||||
/* top field */
|
||||
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
|
||||
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
|
||||
movq_r2r( mm0, mm2 );
|
||||
psubusb_r2r( mm1, mm2 );
|
||||
psubusb_r2r( mm0, mm1 );
|
||||
|
||||
pcmpgtb_r2r( mm5, mm2 );
|
||||
pcmpgtb_r2r( mm5, mm1 );
|
||||
psadbw_r2r( mm6, mm2 );
|
||||
psadbw_r2r( mm6, mm1 );
|
||||
|
||||
paddd_r2r( mm2, mm1 );
|
||||
paddd_r2r( mm1, mm3 ); /* add to top field score */
|
||||
|
||||
p_pix_c += i_pitch_curr;
|
||||
p_pix_p += i_pitch_prev;
|
||||
|
||||
/* bottom field - handling identical to top field, except... */
|
||||
movq_m2r( *((uint64_t*)p_pix_c), mm0 );
|
||||
movq_m2r( *((uint64_t*)p_pix_p), mm1 );
|
||||
movq_r2r( mm0, mm2 );
|
||||
psubusb_r2r( mm1, mm2 );
|
||||
psubusb_r2r( mm0, mm1 );
|
||||
|
||||
pcmpgtb_r2r( mm5, mm2 );
|
||||
pcmpgtb_r2r( mm5, mm1 );
|
||||
psadbw_r2r( mm6, mm2 );
|
||||
psadbw_r2r( mm6, mm1 );
|
||||
|
||||
paddd_r2r( mm2, mm1 );
|
||||
paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
|
||||
|
||||
p_pix_c += i_pitch_curr;
|
||||
p_pix_p += i_pitch_prev;
|
||||
}
|
||||
movq_r2r( mm3, mm7 ); /* score (total) */
|
||||
paddd_r2r( mm4, mm7 );
|
||||
movd_r2m( mm3, i_top_motion );
|
||||
movd_r2m( mm4, i_bot_motion );
|
||||
movd_r2m( mm7, i_motion );
|
||||
|
||||
/* The loop counts actual score * 255. */
|
||||
i_top_motion /= 255;
|
||||
i_bot_motion /= 255;
|
||||
i_motion /= 255;
|
||||
|
||||
emms();
|
||||
|
||||
(*pi_top) = ( i_top_motion >= 8 );
|
||||
(*pi_bot) = ( i_bot_motion >= 8 );
|
||||
return (i_motion >= 8);
|
||||
}
|
||||
#endif
|
||||
#undef T
|
||||
|
||||
/*****************************************************************************
|
||||
@ -396,11 +315,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
|
||||
|
||||
int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
|
||||
TestForMotionInBlock;
|
||||
/* We must tell our inline helper whether to use MMX acceleration. */
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
if (vlc_CPU_MMXEXT())
|
||||
motion_in_block = TestForMotionInBlockMMX;
|
||||
#endif
|
||||
|
||||
int i_score = 0;
|
||||
for( int i_plane = 0 ; i_plane < p_prev->i_planes ; i_plane++ )
|
||||
@ -451,142 +365,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
|
||||
/* Threshold (value from Transcode 1.1.5) */
|
||||
#define T 100
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
VLC_MMX
|
||||
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
|
||||
const picture_t* p_pic_bot )
|
||||
{
|
||||
assert( p_pic_top->i_planes == p_pic_bot->i_planes );
|
||||
|
||||
/* Amount of bits must be known for MMX, thus int32_t.
|
||||
Doesn't hurt the C implementation. */
|
||||
int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
|
||||
int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
|
||||
|
||||
pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
|
||||
|
||||
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
|
||||
{
|
||||
/* Sanity check */
|
||||
if( p_pic_top->p[i_plane].i_visible_lines !=
|
||||
p_pic_bot->p[i_plane].i_visible_lines )
|
||||
return -1;
|
||||
|
||||
const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
|
||||
const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
|
||||
p_pic_bot->p[i_plane].i_visible_pitch );
|
||||
const int wm8 = w % 8; /* remainder */
|
||||
const int w8 = w - wm8; /* part of width that is divisible by 8 */
|
||||
|
||||
/* Current line / neighbouring lines picture pointers */
|
||||
const picture_t *cur = p_pic_bot;
|
||||
const picture_t *ngh = p_pic_top;
|
||||
int wc = cur->p[i_plane].i_pitch;
|
||||
int wn = ngh->p[i_plane].i_pitch;
|
||||
|
||||
/* Transcode 1.1.5 only checks every other line. Checking every line
|
||||
works better for anime, which may contain horizontal,
|
||||
one pixel thick cartoon outlines.
|
||||
*/
|
||||
for( int y = 1; y < i_lasty; ++y )
|
||||
{
|
||||
uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */
|
||||
uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
|
||||
uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
|
||||
|
||||
int x = 0;
|
||||
|
||||
/* Easy-to-read C version further below.
|
||||
|
||||
Assumptions: 0 < T < 127
|
||||
# of pixels < (2^32)/255
|
||||
Note: calculates score * 255
|
||||
*/
|
||||
static alignas (8) const mmx_t b0 = {
|
||||
.uq = 0x0000000000000000ULL };
|
||||
static alignas (8) const mmx_t b128 = {
|
||||
.uq = 0x8080808080808080ULL };
|
||||
static alignas (8) const mmx_t bT = {
|
||||
.ub = { T, T, T, T, T, T, T, T } };
|
||||
|
||||
for( ; x < w8; x += 8 )
|
||||
{
|
||||
movq_m2r( *((int64_t*)p_c), mm0 );
|
||||
movq_m2r( *((int64_t*)p_p), mm1 );
|
||||
movq_m2r( *((int64_t*)p_n), mm2 );
|
||||
|
||||
psubb_m2r( b128, mm0 );
|
||||
psubb_m2r( b128, mm1 );
|
||||
psubb_m2r( b128, mm2 );
|
||||
|
||||
psubsb_r2r( mm0, mm1 );
|
||||
psubsb_r2r( mm0, mm2 );
|
||||
|
||||
pxor_r2r( mm3, mm3 );
|
||||
pxor_r2r( mm4, mm4 );
|
||||
pxor_r2r( mm5, mm5 );
|
||||
pxor_r2r( mm6, mm6 );
|
||||
|
||||
punpcklbw_r2r( mm1, mm3 );
|
||||
punpcklbw_r2r( mm2, mm4 );
|
||||
punpckhbw_r2r( mm1, mm5 );
|
||||
punpckhbw_r2r( mm2, mm6 );
|
||||
|
||||
pmulhw_r2r( mm3, mm4 );
|
||||
pmulhw_r2r( mm5, mm6 );
|
||||
|
||||
packsswb_r2r(mm4, mm6);
|
||||
pcmpgtb_m2r( bT, mm6 );
|
||||
psadbw_m2r( b0, mm6 );
|
||||
paddd_r2r( mm6, mm7 );
|
||||
|
||||
p_c += 8;
|
||||
p_p += 8;
|
||||
p_n += 8;
|
||||
}
|
||||
|
||||
for( ; x < w; ++x )
|
||||
{
|
||||
/* Worst case: need 17 bits for "comb". */
|
||||
int_fast32_t C = *p_c;
|
||||
int_fast32_t P = *p_p;
|
||||
int_fast32_t N = *p_n;
|
||||
|
||||
/* Comments in Transcode's filter_ivtc.c attribute this
|
||||
combing metric to Gunnar Thalin.
|
||||
|
||||
The idea is that if the picture is interlaced, both
|
||||
expressions will have the same sign, and this comes
|
||||
up positive. The value T = 100 has been chosen such
|
||||
that a pixel difference of 10 (on average) will
|
||||
trigger the detector.
|
||||
*/
|
||||
int_fast32_t comb = (P - C) * (N - C);
|
||||
if( comb > T )
|
||||
++i_score_c;
|
||||
|
||||
++p_c;
|
||||
++p_p;
|
||||
++p_n;
|
||||
}
|
||||
|
||||
/* Now the other field - swap current and neighbour pictures */
|
||||
const picture_t *tmp = cur;
|
||||
cur = ngh;
|
||||
ngh = tmp;
|
||||
int tmp_pitch = wc;
|
||||
wc = wn;
|
||||
wn = tmp_pitch;
|
||||
}
|
||||
}
|
||||
|
||||
movd_r2m( mm7, i_score_mmx );
|
||||
emms();
|
||||
|
||||
return i_score_mmx/255 + i_score_c;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* See header for function doc. */
|
||||
int CalculateInterlaceScore( const picture_t* p_pic_top,
|
||||
const picture_t* p_pic_bot )
|
||||
@ -607,11 +385,6 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
|
||||
if( p_pic_top->i_planes != p_pic_bot->i_planes )
|
||||
return -1;
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
if (vlc_CPU_MMXEXT())
|
||||
return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
|
||||
#endif
|
||||
|
||||
int32_t i_score = 0;
|
||||
|
||||
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
|
||||
|
@ -33,10 +33,6 @@
|
||||
#include <vlc_cpu.h>
|
||||
#include "merge.h"
|
||||
|
||||
#ifdef CAN_COMPILE_MMXEXT
|
||||
# include "mmx.h"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ALTIVEC_H
|
||||
# include <altivec.h>
|
||||
#endif
|
||||
@ -67,32 +63,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
|
||||
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
|
||||
}
|
||||
|
||||
#if defined(CAN_COMPILE_MMXEXT)
|
||||
VLC_MMX
|
||||
void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
|
||||
size_t i_bytes )
|
||||
{
|
||||
uint8_t *p_dest = _p_dest;
|
||||
const uint8_t *p_s1 = _p_s1;
|
||||
const uint8_t *p_s2 = _p_s2;
|
||||
|
||||
for( ; i_bytes >= 8; i_bytes -= 8 )
|
||||
{
|
||||
__asm__ __volatile__( "movq %2,%%mm1;"
|
||||
"pavgb %1, %%mm1;"
|
||||
"movq %%mm1, %0" :"=m" (*p_dest):
|
||||
"m" (*p_s1),
|
||||
"m" (*p_s2) : "mm1" );
|
||||
p_dest += 8;
|
||||
p_s1 += 8;
|
||||
p_s2 += 8;
|
||||
}
|
||||
|
||||
for( ; i_bytes > 0; i_bytes-- )
|
||||
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CAN_COMPILE_SSE)
|
||||
VLC_SSE
|
||||
void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
|
||||
@ -223,8 +193,8 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
|
||||
* EndMerge routines
|
||||
*****************************************************************************/
|
||||
|
||||
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
|
||||
void EndMMX( void )
|
||||
#if defined(CAN_COMPILE_SSE2)
|
||||
void EndSSE( void )
|
||||
{
|
||||
__asm__ __volatile__( "emms" :: );
|
||||
}
|
||||
|
@ -112,18 +112,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
|
||||
void MergeAltivec ( void *, const void *, const void *, size_t );
|
||||
#endif
|
||||
|
||||
#if defined(CAN_COMPILE_MMXEXT)
|
||||
/**
|
||||
* MMXEXT routine to blend pixels from two picture lines.
|
||||
*
|
||||
* @param _p_dest Target
|
||||
* @param _p_s1 Source line A
|
||||
* @param _p_s2 Source line B
|
||||
* @param i_bytes Number of bytes to merge
|
||||
*/
|
||||
void MergeMMXEXT ( void *, const void *, const void *, size_t );
|
||||
#endif
|
||||
|
||||
#if defined(CAN_COMPILE_SSE)
|
||||
/**
|
||||
* SSE2 routine to blend pixels from two picture lines.
|
||||
@ -175,17 +163,17 @@ void merge16_arm_sve(void *, const void *, const void *, size_t);
|
||||
* EndMerge routines
|
||||
*****************************************************************************/
|
||||
|
||||
#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
|
||||
#if defined(CAN_COMPILE_SSE2)
|
||||
/**
|
||||
* MMX merge finalization routine.
|
||||
* SSE merge finalization routine.
|
||||
*
|
||||
* Must be called after an MMX merge is finished.
|
||||
* This exits MMX mode (by executing the "emms" instruction).
|
||||
* Should be called after an SSE merge is finished.
|
||||
* This exits SSE mode (by executing the "emms" instruction).
|
||||
*
|
||||
* The EndMerge() macro detects whether this is needed, and calls if it is,
|
||||
* so just use that.
|
||||
*/
|
||||
void EndMMX ( void );
|
||||
void EndSSE( void );
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -1,256 +0,0 @@
|
||||
/*
|
||||
* mmx.h
|
||||
* Copyright (C) 1997-1999 H. Dietz and R. Fisher
|
||||
*
|
||||
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
|
||||
*
|
||||
* mpeg2dec is free software; you can redistribute it and/or modify
|
||||
* under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation; either version 2.1 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* mpeg2dec is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program; if not, write to the Free Software Foundation,
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
|
||||
/*
|
||||
* The type of a value that fits in an MMX register (note that long
|
||||
* long constant values MUST be suffixed by LL and unsigned long long
|
||||
* values by ULL, lest they be truncated by the compiler)
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef union {
|
||||
int64_t q; /* Quadword (64-bit) value */
|
||||
uint64_t uq; /* Unsigned Quadword */
|
||||
int32_t d[2]; /* 2 Doubleword (32-bit) values */
|
||||
uint32_t ud[2]; /* 2 Unsigned Doubleword */
|
||||
int16_t w[4]; /* 4 Word (16-bit) values */
|
||||
uint16_t uw[4]; /* 4 Unsigned Word */
|
||||
int8_t b[8]; /* 8 Byte (8-bit) values */
|
||||
uint8_t ub[8]; /* 8 Unsigned Byte */
|
||||
float s[2]; /* Single-precision (32-bit) value */
|
||||
} mmx_t; /* NOTE: must be on an 8-byte (64-bit) boundary */
|
||||
|
||||
|
||||
#define mmx_i2r(op,imm,reg) \
|
||||
__asm__ __volatile__ (#op " %0, %%" #reg \
|
||||
: /* nothing */ \
|
||||
: "i" (imm) \
|
||||
: #reg)
|
||||
|
||||
#define mmx_m2r(op,mem,reg) \
|
||||
__asm__ __volatile__ (#op " %0, %%" #reg \
|
||||
: /* nothing */ \
|
||||
: "m" (mem) \
|
||||
: #reg)
|
||||
|
||||
#define mmx_r2m(op,reg,mem) \
|
||||
__asm__ __volatile__ (#op " %%" #reg ", %0" \
|
||||
: "=m" (mem) \
|
||||
: /* nothing */ \
|
||||
: "memory")
|
||||
|
||||
/*
 * Register-to-register form: emits the instruction `op` with register
 * `regs` as the first operand and register `regd` as the second.
 * All three arguments are stringified into the template; the statement has
 * no C operands and declares `regd` as clobbered.
 */
#define mmx_r2r(op,regs,regd) \
|
||||
__asm__ __volatile__ (#op " %%" #regs ", %%" #regd ::: #regd)
|
||||
|
||||
|
||||
#define emms() __asm__ __volatile__ ("emms")
|
||||
|
||||
#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
|
||||
#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
|
||||
#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
|
||||
|
||||
#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
|
||||
#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
|
||||
#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
|
||||
|
||||
#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
|
||||
#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
|
||||
#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
|
||||
#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
|
||||
|
||||
#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
|
||||
#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
|
||||
|
||||
#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
|
||||
#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
|
||||
#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
|
||||
#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
|
||||
#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
|
||||
#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
|
||||
|
||||
#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
|
||||
#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
|
||||
#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
|
||||
#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
|
||||
|
||||
#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
|
||||
#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
|
||||
#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
|
||||
#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
|
||||
|
||||
#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
|
||||
#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
|
||||
|
||||
#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
|
||||
#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
|
||||
|
||||
#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
|
||||
#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
|
||||
#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
|
||||
#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
|
||||
#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
|
||||
#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
|
||||
|
||||
#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
|
||||
#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
|
||||
#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
|
||||
#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
|
||||
#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
|
||||
#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
|
||||
|
||||
#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
|
||||
#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
|
||||
|
||||
#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
|
||||
#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
|
||||
|
||||
#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
|
||||
#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
|
||||
|
||||
#define por_m2r(var,reg) mmx_m2r (por, var, reg)
|
||||
#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
|
||||
|
||||
#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
|
||||
#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
|
||||
#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
|
||||
#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
|
||||
#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
|
||||
#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
|
||||
#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
|
||||
#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
|
||||
#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
|
||||
|
||||
#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
|
||||
#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
|
||||
#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
|
||||
#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
|
||||
#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
|
||||
#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
|
||||
|
||||
#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
|
||||
#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
|
||||
#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
|
||||
#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
|
||||
#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
|
||||
#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
|
||||
#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
|
||||
#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
|
||||
#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
|
||||
|
||||
#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
|
||||
#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
|
||||
#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
|
||||
#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
|
||||
#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
|
||||
#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
|
||||
|
||||
#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
|
||||
#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
|
||||
#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
|
||||
#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
|
||||
|
||||
#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
|
||||
#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
|
||||
#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
|
||||
#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
|
||||
|
||||
#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
|
||||
#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
|
||||
#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
|
||||
#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
|
||||
#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
|
||||
#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
|
||||
|
||||
#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
|
||||
#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
|
||||
#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
|
||||
#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
|
||||
#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
|
||||
#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
|
||||
|
||||
#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
|
||||
#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
|
||||
|
||||
|
||||
/* AMD MMX extensions - also available in Intel SSE */
|
||||
|
||||
|
||||
/*
 * Memory-to-register form with a trailing immediate: emits the instruction
 * `op` with `imm` as the first operand, `mem` as the second and register
 * `reg` as the third (used by e.g. pshufw_m2r).
 *
 * `op` and `reg` are stringified into the template, so they must be a bare
 * mnemonic and a bare register name. `mem` must be an lvalue ("m"
 * constraint) and `imm` a compile-time integer constant ("i" constraint),
 * matching mmx_m2r and mmx_i2r. The previous "X" constraint placed no
 * restriction on how the compiler materialised either operand.
 * `reg` is declared as clobbered.
 */
#define mmx_m2ri(op,mem,reg,imm) \
    __asm__ __volatile__ (#op " %1, %0, %%" #reg \
                          : /* nothing */ \
                          : "m" (mem), "i" (imm) \
                          : #reg)
|
||||
/*
 * Register-to-register form with a trailing immediate: emits the
 * instruction `op` with `imm` as the first operand, register `regs` as the
 * second and register `regd` as the third (used by e.g. pshufw_r2r,
 * pextrw_r2r, pinsrw_r2r).
 *
 * `op`, `regs` and `regd` are stringified into the template. `imm` must be
 * a compile-time integer constant: it uses the "i" constraint, as in
 * mmx_i2r, instead of the unrestricted "X" constraint used previously.
 * `regd` is declared as clobbered.
 */
#define mmx_r2ri(op,regs,regd,imm) \
    __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
                          : /* nothing */ \
                          : "i" (imm) \
                          : #regd)
|
||||
|
||||
/*
 * Emits `prefetch<hint>` on `mem` (hint is one of the suffixes pasted in by
 * the prefetcht0/t1/t2/nta wrappers).
 *
 * `mem` must be an lvalue designating the location to prefetch. It is
 * passed with the "m" constraint so the operand is always rendered as a
 * memory reference; the previous "X" constraint did not require that.
 */
#define mmx_fetch(mem,hint) \
    __asm__ __volatile__ ("prefetch" #hint " %0" \
                          : /* nothing */ \
                          : "m" (mem))
|
||||
|
||||
|
||||
/*
 * Byte-masked store of MMX register `regs` under the byte mask held in MMX
 * register `maskreg`.
 *
 * The old definition forwarded to the four-parameter mmx_r2ri with only
 * three arguments, so any use failed to preprocess; maskmovq takes no
 * immediate. The statement is now spelled out directly. In AT&T syntax the
 * mask register comes first and the data register second.
 *
 * The destination address is implicit (DS:(E)DI / RDI per the instruction
 * reference) and must be set up by the caller. Because the instruction
 * writes memory that is not visible as a C operand, a "memory" clobber is
 * declared.
 */
#define maskmovq(regs,maskreg) \
    __asm__ __volatile__ ("maskmovq %%" #maskreg ", %%" #regs ::: "memory")
|
||||
|
||||
#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
|
||||
|
||||
#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
|
||||
#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
|
||||
#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
|
||||
#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
|
||||
|
||||
#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
|
||||
|
||||
#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
|
||||
|
||||
#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
|
||||
#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
|
||||
|
||||
#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
|
||||
#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
|
||||
|
||||
#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
|
||||
#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
|
||||
|
||||
#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
|
||||
#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
|
||||
|
||||
/*
 * Emits `pmovmskb %mmreg, %reg`: collects the most significant bit of each
 * byte of MMX register `mmreg` into general-purpose register `reg`.
 *
 * Two defects of the previous definition are fixed:
 *  - it emitted `movmskps` (the packed-float sign-mask instruction) rather
 *    than `pmovmskb`;
 *  - it wrote register names with a single '%' although the statement has a
 *    clobber list and is therefore extended asm, where a literal '%' must
 *    be written as "%%".
 *
 * Both arguments are stringified, so they must be bare register names.
 * `reg` is declared as clobbered.
 */
#define pmovmskb(mmreg,reg) \
    __asm__ __volatile__ ("pmovmskb %%" #mmreg ", %%" #reg : : : #reg)
|
||||
|
||||
#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
|
||||
#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
|
||||
|
||||
#define prefetcht0(mem) mmx_fetch (mem, t0)
|
||||
#define prefetcht1(mem) mmx_fetch (mem, t1)
|
||||
#define prefetcht2(mem) mmx_fetch (mem, t2)
|
||||
#define prefetchnta(mem) mmx_fetch (mem, nta)
|
||||
|
||||
#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
|
||||
#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
|
||||
|
||||
#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
|
||||
#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
|
||||
|
||||
#define sfence() __asm__ __volatile__ ("sfence\n\t")
|
@ -97,6 +97,3 @@ static void yadif_filter_line_c_16bit(uint8_t *dst8, uint8_t *prev8, uint8_t *cu
|
||||
void vlcpriv_yadif_filter_line_ssse3(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
|
||||
void vlcpriv_yadif_filter_line_sse2(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
|
||||
#endif
|
||||
#if defined(__i386__)
|
||||
void vlcpriv_yadif_filter_line_mmxext(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
|
||||
#endif
|
||||
|
@ -248,9 +248,6 @@ cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
|
||||
FILTER 0, curq, nextq
|
||||
|
||||
.ret:
|
||||
%if mmsize == 8
|
||||
emms
|
||||
%endif
|
||||
RET
|
||||
%if ARCH_X86_32
|
||||
%undef pb_1
|
||||
@ -262,7 +259,3 @@ INIT_XMM ssse3
|
||||
YADIF
|
||||
INIT_XMM sse2
|
||||
YADIF
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmxext
|
||||
YADIF
|
||||
%endif
|
||||
|
Loading…
Reference in New Issue
Block a user