1
mirror of https://code.videolan.org/videolan/vlc synced 2024-07-25 09:41:30 +02:00

yadif: add SSSE3 and SSE2 support, ported from FFmpeg.

Signed-off-by: Jean-Baptiste Kempf <jb@videolan.org>
This commit is contained in:
Naohiro KORIYAMA 2011-12-21 17:02:09 +09:00 committed by Jean-Baptiste Kempf
parent 8634f76199
commit 5c7c27cae5
3 changed files with 377 additions and 270 deletions

View File

@ -1,5 +1,5 @@
/*****************************************************************************
* algo_yadif.c : Wrapper for MPlayer's Yadif algorithm
* algo_yadif.c : Wrapper for FFmpeg's Yadif algorithm
*****************************************************************************
* Copyright (C) 2000-2011 the VideoLAN team
* $Id$
@ -26,10 +26,6 @@
# include "config.h"
#endif
#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
#endif
#include <stdint.h>
#include <assert.h>
@ -47,23 +43,7 @@
* Yadif (Yet Another DeInterlacing Filter).
*****************************************************************************/
/* Yadif's private data struct */
struct vf_priv_s {
/*
* 0: Output 1 frame for each frame.
* 1: Output 1 frame for each field.
* 2: Like 0 but skips spatial interlacing check.
* 3: Like 1 but skips spatial interlacing check.
*
* In vlc, only & 0x02 has meaning, as we do the & 0x01 ourself.
*/
int mode;
};
/* I am unsure it is the right one */
typedef intptr_t x86_reg;
/* yadif.h comes from vf_yadif.c of mplayer project.
/* yadif.h comes from yadif.c of FFmpeg project.
Necessary preprocessor macros are defined in common.h. */
#include "yadif.h"
@ -125,15 +105,22 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
if( p_prev && p_cur && p_next )
{
/* */
void (*filter)(struct vf_priv_s *p, uint8_t *dst,
uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int refs, int parity);
void (*filter)(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int prefs, int mrefs, int parity, int mode);
filter = yadif_filter_line_c;
#if defined(HAVE_YADIF_MMX)
if( vlc_CPU() & CPU_CAPABILITY_MMX )
filter = yadif_filter_line_mmx;
#endif
#if defined(HAVE_YADIF_SSE2)
if( vlc_CPU() & CPU_CAPABILITY_SSE2 )
filter = yadif_filter_line_mmx2;
else
filter = yadif_filter_line_sse2;
#endif
#if defined(HAVE_YADIF_SSSE3)
if( vlc_CPU() & CPU_CAPABILITY_SSSE3 )
filter = yadif_filter_line_ssse3;
#endif
filter = yadif_filter_line_c;
for( int n = 0; n < p_dst->i_planes; n++ )
{
@ -151,19 +138,20 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
}
else
{
struct vf_priv_s cfg;
int mode;
/* Spatial checks only when enough data */
cfg.mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
mode = (y >= 2 && y < dstp->i_visible_lines - 2) ? 0 : 2;
assert( prevp->i_pitch == curp->i_pitch && curp->i_pitch == nextp->i_pitch );
filter( &cfg,
&dstp->p_pixels[y * dstp->i_pitch],
filter( &dstp->p_pixels[y * dstp->i_pitch],
&prevp->p_pixels[y * prevp->i_pitch],
&curp->p_pixels[y * curp->i_pitch],
&nextp->p_pixels[y * nextp->i_pitch],
dstp->i_visible_pitch,
curp->i_pitch,
yadif_parity );
y < dstp->i_visible_lines - 2 ? curp->i_pitch : -curp->i_pitch,
y - 1 ? -curp->i_pitch : curp->i_pitch,
yadif_parity,
mode );
}
/* We duplicate the first and last lines */

View File

@ -1,274 +1,118 @@
/*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of MPlayer.
* This file is part of FFmpeg.
*
* MPlayer is free software; you can redistribute it and/or modify
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* MPlayer is distributed in the hope that it will be useful,
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with MPlayer; if not, write to the Free Software Foundation, Inc.,
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/* */
#if defined(CAN_COMPILE_SSE2) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))
#define HAVE_YADIF_SSE2
#define LOAD4(mem,dst) \
"movd "mem", "#dst" \n\t"\
"punpcklbw %%mm7, "#dst" \n\t"
#define PABS(tmp,dst) \
"pxor "#tmp", "#tmp" \n\t"\
"psubw "#dst", "#tmp" \n\t"\
"pmaxsw "#tmp", "#dst" \n\t"
#define CHECK(pj,mj) \
"movq "#pj"(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1+j] */\
"movq "#mj"(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1-j] */\
"movq %%mm2, %%mm4 \n\t"\
"movq %%mm2, %%mm5 \n\t"\
"pxor %%mm3, %%mm4 \n\t"\
"pavgb %%mm3, %%mm5 \n\t"\
"pand %[pb1], %%mm4 \n\t"\
"psubusb %%mm4, %%mm5 \n\t"\
"psrlq $8, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
"movq %%mm2, %%mm4 \n\t"\
"psubusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pmaxub %%mm3, %%mm2 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm2, %%mm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
"psrlq $8, %%mm3 \n\t" /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
"psrlq $16, %%mm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm4, %%mm2 \n\t" /* score */
#define CHECK1 \
"movq %%mm0, %%mm3 \n\t"\
"pcmpgtw %%mm2, %%mm3 \n\t" /* if(score < spatial_score) */\
"pminsw %%mm2, %%mm0 \n\t" /* spatial_score= score; */\
"movq %%mm3, %%mm6 \n\t"\
"pand %%mm3, %%mm5 \n\t"\
"pandn %%mm1, %%mm3 \n\t"\
"por %%mm5, %%mm3 \n\t"\
"movq %%mm3, %%mm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
hurts both quality and speed, but matches the C version. */\
"paddw %[pw1], %%mm6 \n\t"\
"psllw $14, %%mm6 \n\t"\
"paddsw %%mm6, %%mm2 \n\t"\
"movq %%mm0, %%mm3 \n\t"\
"pcmpgtw %%mm2, %%mm3 \n\t"\
"pminsw %%mm2, %%mm0 \n\t"\
"pand %%mm3, %%mm5 \n\t"\
"pandn %%mm1, %%mm3 \n\t"\
"por %%mm5, %%mm3 \n\t"\
"movq %%mm3, %%mm1 \n\t"
static void yadif_filter_line_mmx2(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){
static const uint64_t pw_1 = 0x0001000100010001ULL;
static const uint64_t pb_1 = 0x0101010101010101ULL;
const int mode = p->mode;
uint64_t tmp0, tmp1, tmp2, tmp3;
int x;
#define FILTER\
for(x=0; x<w; x+=4){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
LOAD4("(%[cur],%[mrefs])", %%mm0) /* c = cur[x-refs] */\
LOAD4("(%[cur],%[prefs])", %%mm1) /* e = cur[x+refs] */\
LOAD4("(%["prev2"])", %%mm2) /* prev2[x] */\
LOAD4("(%["next2"])", %%mm3) /* next2[x] */\
"movq %%mm3, %%mm4 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"psraw $1, %%mm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
"movq %%mm0, %[tmp0] \n\t" /* c */\
"movq %%mm3, %[tmp1] \n\t" /* d */\
"movq %%mm1, %[tmp2] \n\t" /* e */\
"psubw %%mm4, %%mm2 \n\t"\
PABS( %%mm4, %%mm2) /* temporal_diff0 */\
LOAD4("(%[prev],%[mrefs])", %%mm3) /* prev[x-refs] */\
LOAD4("(%[prev],%[prefs])", %%mm4) /* prev[x+refs] */\
"psubw %%mm0, %%mm3 \n\t"\
"psubw %%mm1, %%mm4 \n\t"\
PABS( %%mm5, %%mm3)\
PABS( %%mm5, %%mm4)\
"paddw %%mm4, %%mm3 \n\t" /* temporal_diff1 */\
"psrlw $1, %%mm2 \n\t"\
"psrlw $1, %%mm3 \n\t"\
"pmaxsw %%mm3, %%mm2 \n\t"\
LOAD4("(%[next],%[mrefs])", %%mm3) /* next[x-refs] */\
LOAD4("(%[next],%[prefs])", %%mm4) /* next[x+refs] */\
"psubw %%mm0, %%mm3 \n\t"\
"psubw %%mm1, %%mm4 \n\t"\
PABS( %%mm5, %%mm3)\
PABS( %%mm5, %%mm4)\
"paddw %%mm4, %%mm3 \n\t" /* temporal_diff2 */\
"psrlw $1, %%mm3 \n\t"\
"pmaxsw %%mm3, %%mm2 \n\t"\
"movq %%mm2, %[tmp3] \n\t" /* diff */\
\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm0, %%mm0 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psrlw $1, %%mm1 \n\t" /* spatial_pred */\
PABS( %%mm2, %%mm0) /* ABS(c-e) */\
\
"movq -1(%[cur],%[mrefs]), %%mm2 \n\t" /* cur[x-refs-1] */\
"movq -1(%[cur],%[prefs]), %%mm3 \n\t" /* cur[x+refs-1] */\
"movq %%mm2, %%mm4 \n\t"\
"psubusb %%mm3, %%mm2 \n\t"\
"psubusb %%mm4, %%mm3 \n\t"\
"pmaxub %%mm3, %%mm2 \n\t"\
"pshufw $9,%%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
"punpcklbw %%mm7, %%mm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psubw %[pw1], %%mm0 \n\t" /* spatial_score */\
\
CHECK(-2,0)\
CHECK1\
CHECK(-3,1)\
CHECK2\
CHECK(0,-2)\
CHECK1\
CHECK(1,-3)\
CHECK2\
\
/* if(p->mode<2) ... */\
"movq %[tmp3], %%mm6 \n\t" /* diff */\
"cmpl $2, %[mode] \n\t"\
"jge 1f \n\t"\
LOAD4("(%["prev2"],%[mrefs],2)", %%mm2) /* prev2[x-2*refs] */\
LOAD4("(%["next2"],%[mrefs],2)", %%mm4) /* next2[x-2*refs] */\
LOAD4("(%["prev2"],%[prefs],2)", %%mm3) /* prev2[x+2*refs] */\
LOAD4("(%["next2"],%[prefs],2)", %%mm5) /* next2[x+2*refs] */\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psrlw $1, %%mm2 \n\t" /* b */\
"psrlw $1, %%mm3 \n\t" /* f */\
"movq %[tmp0], %%mm4 \n\t" /* c */\
"movq %[tmp1], %%mm5 \n\t" /* d */\
"movq %[tmp2], %%mm7 \n\t" /* e */\
"psubw %%mm4, %%mm2 \n\t" /* b-c */\
"psubw %%mm7, %%mm3 \n\t" /* f-e */\
"movq %%mm5, %%mm0 \n\t"\
"psubw %%mm4, %%mm5 \n\t" /* d-c */\
"psubw %%mm7, %%mm0 \n\t" /* d-e */\
"movq %%mm2, %%mm4 \n\t"\
"pminsw %%mm3, %%mm2 \n\t"\
"pmaxsw %%mm4, %%mm3 \n\t"\
"pmaxsw %%mm5, %%mm2 \n\t"\
"pminsw %%mm5, %%mm3 \n\t"\
"pmaxsw %%mm0, %%mm2 \n\t" /* max */\
"pminsw %%mm0, %%mm3 \n\t" /* min */\
"pxor %%mm4, %%mm4 \n\t"\
"pmaxsw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm4 \n\t" /* -max */\
"pmaxsw %%mm4, %%mm6 \n\t" /* diff= MAX3(diff, min, -max); */\
"1: \n\t"\
\
"movq %[tmp1], %%mm2 \n\t" /* d */\
"movq %%mm2, %%mm3 \n\t"\
"psubw %%mm6, %%mm2 \n\t" /* d-diff */\
"paddw %%mm6, %%mm3 \n\t" /* d+diff */\
"pmaxsw %%mm2, %%mm1 \n\t"\
"pminsw %%mm3, %%mm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
"packuswb %%mm1, %%mm1 \n\t"\
\
:[tmp0]"=m"(tmp0),\
[tmp1]"=m"(tmp1),\
[tmp2]"=m"(tmp2),\
[tmp3]"=m"(tmp3)\
:[prev] "r"(prev),\
[cur] "r"(cur),\
[next] "r"(next),\
[prefs]"r"((x86_reg)refs),\
[mrefs]"r"((x86_reg)-refs),\
[pw1] "m"(pw_1),\
[pb1] "m"(pb_1),\
[mode] "g"(mode)\
);\
__asm__ volatile("movd %%mm1, %0" :"=m"(*dst));\
dst += 4;\
prev+= 4;\
cur += 4;\
next+= 4;\
}
if(parity){
#define prev2 "prev"
#define next2 "cur"
FILTER
#undef prev2
#undef next2
}else{
#define prev2 "cur"
#define next2 "next"
FILTER
#undef prev2
#undef next2
}
}
#undef LOAD4
#undef PABS
#undef CHECK
#undef CHECK1
#undef CHECK2
#undef FILTER
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
static void yadif_filter_line_c(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){
/*
 * Helpers for declaring aligned objects. They are only defined when __GNUC__
 * is defined, because they rely on the GNU __attribute__ syntax.
 *
 * DECLARE_ALIGNED(n,t,v)    declares v with type t and n-byte alignment.
 * DECLARE_ASM_CONST(n,t,v)  declares a static const v with type t and n-byte
 *                           alignment; when VLC_GCC_VERSION(3,1) holds it also
 *                           adds __attribute__((used)).
 */
#if defined(__GNUC__)
# define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
# if VLC_GCC_VERSION(3,1)
/* NOTE(review): "used" is presumably there so the constants are still emitted
 * although they are only referenced as inline-asm operands -- confirm. */
# define DECLARE_ASM_CONST(n,t,v) static const t __attribute__((used)) __attribute__ ((aligned (n))) v
# else
# define DECLARE_ASM_CONST(n,t,v) static const t __attribute__ ((aligned (n))) v
# endif
#endif
/* Integer type that can hold a pointer; yadif_template.h casts prefs/mrefs to
 * it before passing them as "r" asm operands used in address expressions. */
typedef intptr_t x86_reg;
/* 16 bytes of constant data, expressed as two 64-bit halves. */
typedef struct { uint64_t a, b; } xmm_reg;
/* pb_1: 0x01 in every byte; pw_1: 0x0001 in every 16-bit word. Both are
 * 16-byte aligned and are referenced by the template as memory operands
 * %[pb_1] / %[pw_1].
 * DECLARE_ASM_CONST already supplies "const", so the "const" in the type
 * argument repeats the qualifier. */
DECLARE_ASM_CONST(16, const xmm_reg, pb_1) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 0x0001000100010001ULL};
/*
 * yadif_template.h is included up to three times below. Each inclusion defines
 * one function named by RENAME(yadif_filter_line), i.e.
 * yadif_filter_line_ssse3, yadif_filter_line_sse2 and yadif_filter_line_mmx.
 * Before each inclusion the section defines:
 *   HAVE_YADIF_xxx          marks that the variant was compiled;
 *   COMPILE_TEMPLATE_SSE    selects the xmm-register code path in the template;
 *   COMPILE_TEMPLATE_SSSE3  selects the pabsw form of PABS in the template;
 *   VLC_TARGET              prefix placed in front of the function definition;
 *   RENAME(a)               appends the variant suffix to the function name.
 * The COMPILE_TEMPLATE_*, VLC_TARGET and RENAME macros are undefined again
 * after each inclusion; HAVE_YADIF_xxx stays defined.
 */
#ifdef CAN_COMPILE_SSSE3
#if defined(__SSE__) || VLC_GCC_VERSION(4, 4)
// ================ SSSE3 =================
/* NOTE(review): this variant is guarded by __SSE__ and uses VLC_SSE as its
 * target prefix, same as the SSE2 variant -- confirm that is intended. */
#define HAVE_YADIF_SSSE3
#define COMPILE_TEMPLATE_SSE 1
#define COMPILE_TEMPLATE_SSSE3 1
#define VLC_TARGET VLC_SSE
#define RENAME(a) a ## _ssse3
#include "yadif_template.h"
#undef COMPILE_TEMPLATE_SSE
#undef COMPILE_TEMPLATE_SSSE3
#undef VLC_TARGET
#undef RENAME
#endif
#endif
#ifdef CAN_COMPILE_SSE2
#if defined(__SSE__) || VLC_GCC_VERSION(4, 4)
// ================= SSE2 =================
#define HAVE_YADIF_SSE2
#define COMPILE_TEMPLATE_SSE 1
#define VLC_TARGET VLC_SSE
#define RENAME(a) a ## _sse2
#include "yadif_template.h"
#undef COMPILE_TEMPLATE_SSE
#undef VLC_TARGET
#undef RENAME
#endif
#endif
#ifdef CAN_COMPILE_MMX
#if defined(__MMX__) || VLC_GCC_VERSION(4, 4)
// ================ MMX =================
/* Neither COMPILE_TEMPLATE_SSE nor COMPILE_TEMPLATE_SSSE3 is defined here, so
 * the template uses its mm-register path.
 * NOTE(review): that path uses pshufw, pavgb, pmaxub, pminsw and pmaxsw,
 * which to my knowledge belong to the MMXEXT/SSE integer extensions rather
 * than baseline MMX -- confirm the run-time CPU check matches. */
#define HAVE_YADIF_MMX
#define VLC_TARGET VLC_MMX
#define RENAME(a) a ## _mmx
#include "yadif_template.h"
#undef VLC_TARGET
#undef RENAME
#endif
#endif
static void yadif_filter_line_c(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) {
int x;
uint8_t *prev2= parity ? prev : cur ;
uint8_t *next2= parity ? cur : next;
for(x=0; x<w; x++){
int c= cur[-refs];
int c= cur[mrefs];
int d= (prev2[0] + next2[0])>>1;
int e= cur[+refs];
int e= cur[prefs];
int temporal_diff0= FFABS(prev2[0] - next2[0]);
int temporal_diff1=( FFABS(prev[-refs] - c) + FFABS(prev[+refs] - e) )>>1;
int temporal_diff2=( FFABS(next[-refs] - c) + FFABS(next[+refs] - e) )>>1;
int temporal_diff1=( FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e) )>>1;
int temporal_diff2=( FFABS(next[mrefs] - c) + FFABS(next[prefs] - e) )>>1;
int diff= FFMAX3(temporal_diff0>>1, temporal_diff1, temporal_diff2);
int spatial_pred= (c+e)>>1;
int spatial_score= FFABS(cur[-refs-1] - cur[+refs-1]) + FFABS(c-e)
+ FFABS(cur[-refs+1] - cur[+refs+1]) - 1;
int spatial_score= FFABS(cur[mrefs-1] - cur[prefs-1]) + FFABS(c-e)
+ FFABS(cur[mrefs+1] - cur[prefs+1]) - 1;
#define CHECK(j)\
{ int score= FFABS(cur[-refs-1+j] - cur[+refs-1-j])\
+ FFABS(cur[-refs +j] - cur[+refs -j])\
+ FFABS(cur[-refs+1+j] - cur[+refs+1-j]);\
{ int score= FFABS(cur[mrefs-1+j] - cur[prefs-1-j])\
+ FFABS(cur[mrefs +j] - cur[prefs -j])\
+ FFABS(cur[mrefs+1+j] - cur[prefs+1-j]);\
if(score < spatial_score){\
spatial_score= score;\
spatial_pred= (cur[-refs +j] + cur[+refs -j])>>1;\
spatial_pred= (cur[mrefs +j] + cur[prefs -j])>>1;\
CHECK(-1) CHECK(-2) }} }}
CHECK( 1) CHECK( 2) }} }}
if(p->mode<2){
int b= (prev2[-2*refs] + next2[-2*refs])>>1;
int f= (prev2[+2*refs] + next2[+2*refs])>>1;
if(mode<2){
int b= (prev2[2*mrefs] + next2[2*mrefs])>>1;
int f= (prev2[2*prefs] + next2[2*prefs])>>1;
#if 0
int a= cur[-3*refs];
int g= cur[+3*refs];
int a= cur[3*mrefs];
int g= cur[3*prefs];
int max= FFMAX3(d-e, d-c, FFMIN3(FFMAX(b-c,f-e),FFMAX(b-c,b-a),FFMAX(f-g,f-e)) );
int min= FFMIN3(d-e, d-c, FFMAX3(FFMIN(b-c,f-e),FFMIN(b-c,b-a),FFMIN(f-g,f-e)) );
#else

View File

@ -0,0 +1,275 @@
/*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
 * Building blocks for the inline asm of RENAME(yadif_filter_line) below.
 * All of them expand to string literals that are concatenated into one asm
 * template; operands are written in AT&T order (source first, destination
 * last). The includer selects the instruction set through
 * COMPILE_TEMPLATE_SSE / COMPILE_TEMPLATE_SSSE3.
 *
 *   REGMM / MM  register name prefix, without and with the "%%" escape.
 *   MOV         load/store used for pixel data: movq (8 bytes) on the xmm
 *               path, movd (4 bytes) on the mm path.
 *   MOVQ        whole-register move (movdqa / movq).
 *   MOVQU       whole-register load used for the unaligned cur[] reads
 *               (movdqu / movq).
 *   STEP        number of pixels handled per loop iteration (8 / 4).
 *   LOAD        MOV from memory, then punpcklbw with register 7 to widen the
 *               bytes to 16-bit words (register 7 is zeroed by the pxor at the
 *               start of FILTER).
 *   PSRL1/PSRL2 shift the whole register right by one / two bytes.
 *   PSHUF       leaves in its FIRST argument a copy of its SECOND argument
 *               moved down by one 16-bit word; note the parameter names
 *               (src,dst) are the reverse of what the expansion does.
 */
#ifdef COMPILE_TEMPLATE_SSE
#define REGMM "xmm"
#define MM "%%"REGMM
#define MOV "movq"
#define MOVQ "movdqa"
#define MOVQU "movdqu"
#define STEP 8
#define LOAD(mem,dst) \
MOV" "mem", "dst" \n\t"\
"punpcklbw "MM"7, "dst" \n\t"
#define PSRL1(reg) "psrldq $1, "reg" \n\t"
#define PSRL2(reg) "psrldq $2, "reg" \n\t"
#define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
"psrldq $2, "src" \n\t"
#else
#define REGMM "mm"
#define MM "%%"REGMM
#define MOV "movd"
#define MOVQ "movq"
#define MOVQU "movq"
#define STEP 4
#define LOAD(mem,dst) \
MOV" "mem", "dst" \n\t"\
"punpcklbw "MM"7, "dst" \n\t"
#define PSRL1(reg) "psrlq $8, "reg" \n\t"
#define PSRL2(reg) "psrlq $16, "reg" \n\t"
#define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
#endif
/* PABS(tmp,dst): absolute value of the packed signed words in dst.
 * With COMPILE_TEMPLATE_SSSE3 this is a single pabsw and tmp is unused;
 * otherwise tmp is overwritten with (0 - dst) and dst becomes
 * max(dst, tmp). */
#ifdef COMPILE_TEMPLATE_SSSE3
#define PABS(tmp,dst) \
"pabsw "dst", "dst" \n\t"
#else
#define PABS(tmp,dst) \
"pxor "tmp", "tmp" \n\t"\
"psubw "dst", "tmp" \n\t"\
"pmaxsw "tmp", "dst" \n\t"
#endif
/* CHECK(pj,mj): evaluate one diagonal direction. pj is the byte offset applied
 * to the cur+mrefs line and mj the byte offset applied to the cur+prefs line.
 * On exit register 5 holds the widened average of the two neighbours for this
 * direction (the candidate spatial_pred) and register 2 holds the score, i.e.
 * the sum of three absolute differences. Registers 3 and 4 are scratch. */
#define CHECK(pj,mj) \
MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
MOVQ" "MM"2, "MM"4 \n\t"\
MOVQ" "MM"2, "MM"5 \n\t"\
"pxor "MM"3, "MM"4 \n\t"\
"pavgb "MM"3, "MM"5 \n\t"\
"pand %[pb_1], "MM"4 \n\t"\
"psubusb "MM"4, "MM"5 \n\t"\
PSRL1(MM"5") \
"punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
MOVQ" "MM"2, "MM"4 \n\t"\
"psubusb "MM"3, "MM"2 \n\t"\
"psubusb "MM"4, "MM"3 \n\t"\
"pmaxub "MM"3, "MM"2 \n\t"\
MOVQ" "MM"2, "MM"3 \n\t"\
MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
"punpcklbw "MM"7, "MM"2 \n\t"\
"punpcklbw "MM"7, "MM"3 \n\t"\
"punpcklbw "MM"7, "MM"4 \n\t"\
"paddw "MM"3, "MM"2 \n\t"\
"paddw "MM"4, "MM"2 \n\t" /* score */
/* CHECK1: per word, if the score in register 2 is lower than spatial_score in
 * register 0, take it as the new spatial_score and replace spatial_pred
 * (register 1) with the candidate in register 5. The comparison mask is also
 * saved in register 6 for the CHECK2 that follows. */
#define CHECK1 \
MOVQ" "MM"0, "MM"3 \n\t"\
"pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
"pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
MOVQ" "MM"3, "MM"6 \n\t"\
"pand "MM"3, "MM"5 \n\t"\
"pandn "MM"1, "MM"3 \n\t"\
"por "MM"5, "MM"3 \n\t"\
MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/* CHECK2: same update as CHECK1, but first penalises the score using the mask
 * left in register 6 by CHECK1: mask+1 is 0 where CHECK1 succeeded and 1 where
 * it failed; shifted left by 14 and added with signed saturation, this makes
 * the score large in the words where the previous direction was rejected. */
#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
hurts both quality and speed, but matches the C version. */\
"paddw %[pw_1], "MM"6 \n\t"\
"psllw $14, "MM"6 \n\t"\
"paddsw "MM"6, "MM"2 \n\t"\
MOVQ" "MM"0, "MM"3 \n\t"\
"pcmpgtw "MM"2, "MM"3 \n\t"\
"pminsw "MM"2, "MM"0 \n\t"\
"pand "MM"3, "MM"5 \n\t"\
"pandn "MM"1, "MM"3 \n\t"\
"por "MM"5, "MM"3 \n\t"\
MOVQ" "MM"3, "MM"1 \n\t"
/*
 * RENAME(yadif_filter_line): inline-asm version of the yadif line filter.
 * The instruction set depends on the COMPILE_TEMPLATE_* macros in effect when
 * this header is included: xmm registers with STEP == 8 pixels per iteration,
 * or mm registers with STEP == 4.
 *
 *   dst     output line; one MOV-sized group (STEP bytes) is stored per
 *           loop iteration.
 *   prev, cur, next
 *           the same line position in three pictures (previous / current /
 *           next going by the names -- NOTE(review): confirm with the caller).
 *   w       number of pixels to produce. The loop advances by STEP, so when w
 *           is not a multiple of STEP the last iteration still loads and
 *           stores a full group.
 *   prefs   byte offset from a line pointer to the "x+refs" neighbour line.
 *   mrefs   byte offset from a line pointer to the "x-refs" neighbour line.
 *   parity  non-zero: prev2/next2 are prev/cur; zero: they are cur/next.
 *   mode    when mode >= 2 the section that loads the lines at 2*mrefs and
 *           2*prefs and widens diff with min/-max is jumped over.
 *
 * tmp0..tmp3 are 16-byte aligned scratch slots to which c, d, e and diff are
 * spilled during the computation.
 *
 * The unaligned loads in CHECK start up to 3 bytes before the current position
 * and fetch a whole register, so the neighbour lines are read outside the
 * STEP pixels being produced. NOTE(review): presumably the caller guarantees
 * that this padding is readable -- confirm.
 *
 * The filtered pixels are left in register 1 by the first asm statement and
 * written to dst by a second one. NOTE(review): nothing expresses that
 * dependency to the compiler; this appears to rely on register 1 not being
 * touched between the two statements.
 *
 * NOTE(review): no emms is executed here; if the mm-register variant needs one
 * it has to be issued by the caller -- confirm.
 */
VLC_TARGET static void RENAME(yadif_filter_line)(uint8_t *dst,
uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int prefs, int mrefs, int parity, int mode)
{
DECLARE_ALIGNED(16, uint8_t, tmp0)[16];
DECLARE_ALIGNED(16, uint8_t, tmp1)[16];
DECLARE_ALIGNED(16, uint8_t, tmp2)[16];
DECLARE_ALIGNED(16, uint8_t, tmp3)[16];
int x;
/* FILTER is the whole per-line loop. It refers to the string macros prev2 and
 * next2, which are pasted into the asm operand names, so it is expanded once
 * for each parity further down. */
#define FILTER\
for(x=0; x<w; x+=STEP){\
__asm__ volatile(\
"pxor "MM"7, "MM"7 \n\t"\
LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
LOAD("(%["next2"])", MM"3") /* next2[x] */\
MOVQ" "MM"3, "MM"4 \n\t"\
"paddw "MM"2, "MM"3 \n\t"\
"psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
MOVQ" "MM"0, %[tmp0] \n\t" /* c */\
MOVQ" "MM"3, %[tmp1] \n\t" /* d */\
MOVQ" "MM"1, %[tmp2] \n\t" /* e */\
"psubw "MM"4, "MM"2 \n\t"\
PABS( MM"4", MM"2") /* temporal_diff0 */\
LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
"psubw "MM"0, "MM"3 \n\t"\
"psubw "MM"1, "MM"4 \n\t"\
PABS( MM"5", MM"3")\
PABS( MM"5", MM"4")\
"paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
"psrlw $1, "MM"2 \n\t"\
"psrlw $1, "MM"3 \n\t"\
"pmaxsw "MM"3, "MM"2 \n\t"\
LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
"psubw "MM"0, "MM"3 \n\t"\
"psubw "MM"1, "MM"4 \n\t"\
PABS( MM"5", MM"3")\
PABS( MM"5", MM"4")\
"paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
"psrlw $1, "MM"3 \n\t"\
"pmaxsw "MM"3, "MM"2 \n\t"\
MOVQ" "MM"2, %[tmp3] \n\t" /* diff */\
\
"paddw "MM"0, "MM"1 \n\t"\
"paddw "MM"0, "MM"0 \n\t"\
"psubw "MM"1, "MM"0 \n\t"\
"psrlw $1, "MM"1 \n\t" /* spatial_pred */\
PABS( MM"2", MM"0") /* ABS(c-e) */\
\
MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
MOVQ" "MM"2, "MM"4 \n\t"\
"psubusb "MM"3, "MM"2 \n\t"\
"psubusb "MM"4, "MM"3 \n\t"\
"pmaxub "MM"3, "MM"2 \n\t"\
PSHUF(MM"3", MM"2") \
"punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
"punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
"paddw "MM"2, "MM"0 \n\t"\
"paddw "MM"3, "MM"0 \n\t"\
"psubw %[pw_1], "MM"0 \n\t" /* spatial_score */\
\
CHECK(-2,0)\
CHECK1\
CHECK(-3,1)\
CHECK2\
CHECK(0,-2)\
CHECK1\
CHECK(1,-3)\
CHECK2\
\
/* if(mode<2) ... */\
MOVQ" %[tmp3], "MM"6 \n\t" /* diff */\
"cmpl $2, %[mode] \n\t"\
"jge 1f \n\t"\
LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
"paddw "MM"4, "MM"2 \n\t"\
"paddw "MM"5, "MM"3 \n\t"\
"psrlw $1, "MM"2 \n\t" /* b */\
"psrlw $1, "MM"3 \n\t" /* f */\
MOVQ" %[tmp0], "MM"4 \n\t" /* c */\
MOVQ" %[tmp1], "MM"5 \n\t" /* d */\
MOVQ" %[tmp2], "MM"7 \n\t" /* e */\
"psubw "MM"4, "MM"2 \n\t" /* b-c */\
"psubw "MM"7, "MM"3 \n\t" /* f-e */\
MOVQ" "MM"5, "MM"0 \n\t"\
"psubw "MM"4, "MM"5 \n\t" /* d-c */\
"psubw "MM"7, "MM"0 \n\t" /* d-e */\
MOVQ" "MM"2, "MM"4 \n\t"\
"pminsw "MM"3, "MM"2 \n\t"\
"pmaxsw "MM"4, "MM"3 \n\t"\
"pmaxsw "MM"5, "MM"2 \n\t"\
"pminsw "MM"5, "MM"3 \n\t"\
"pmaxsw "MM"0, "MM"2 \n\t" /* max */\
"pminsw "MM"0, "MM"3 \n\t" /* min */\
"pxor "MM"4, "MM"4 \n\t"\
"pmaxsw "MM"3, "MM"6 \n\t"\
"psubw "MM"2, "MM"4 \n\t" /* -max */\
"pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
"1: \n\t"\
\
MOVQ" %[tmp1], "MM"2 \n\t" /* d */\
MOVQ" "MM"2, "MM"3 \n\t"\
"psubw "MM"6, "MM"2 \n\t" /* d-diff */\
"paddw "MM"6, "MM"3 \n\t" /* d+diff */\
"pmaxsw "MM"2, "MM"1 \n\t"\
"pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
"packuswb "MM"1, "MM"1 \n\t"\
\
:[tmp0]"=m"(tmp0),\
[tmp1]"=m"(tmp1),\
[tmp2]"=m"(tmp2),\
[tmp3]"=m"(tmp3)\
:[prev] "r"(prev),\
[cur] "r"(cur),\
[next] "r"(next),\
[prefs]"r"((x86_reg)prefs),\
[mrefs]"r"((x86_reg)mrefs),\
[pw_1] "m"(pw_1),\
[pb_1] "m"(pb_1),\
[mode] "g"(mode)\
:REGMM"0",REGMM"1",REGMM"2",REGMM"3",REGMM"4",REGMM"5",REGMM"6",REGMM"7"\
);\
__asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
dst += STEP;\
prev+= STEP;\
cur += STEP;\
next+= STEP;\
}
/* parity picks which two pictures act as prev2/next2 for the temporal terms. */
if (parity) {
#define prev2 "prev"
#define next2 "cur"
FILTER
#undef prev2
#undef next2
} else {
#define prev2 "cur"
#define next2 "next"
FILTER
#undef prev2
#undef next2
}
}
/* Drop every helper macro defined by this header so that it can be included
 * again with a different COMPILE_TEMPLATE_* / RENAME configuration. */
#undef STEP
#undef REGMM
#undef MM
#undef MOV
#undef MOVQ
#undef MOVQU
#undef PSHUF
#undef PSRL1
#undef PSRL2
#undef LOAD
#undef PABS
#undef CHECK
#undef CHECK1
#undef CHECK2
#undef FILTER