x86/swr: add ff_resample_{common, linear}_float_fma

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
James Almer 2014-06-30 13:06:01 -03:00 committed by Michael Niedermayer
parent a441a2437b
commit 1a69224f44
2 changed files with 51 additions and 35 deletions

View File

@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
%if cpuflag(avx)
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
%ifidn %1, int16
%if mmsize == 16
pshufd m1, m0, q0032
@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
movd [dstq], m0
%else ; float/double
; horizontal sum & store
%if mmsize == 32
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
paddd m2, m3
paddd m0, m1
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
%if cpuflag(avx)
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
addps xm2, xm3
%endif
%ifidn %1, int16
%if mmsize == 16
pshufd m3, m2, q0032
@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
; val += (v2 - val) * (FELEML) frac / c->src_incr;
%if mmsize == 32
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
addps xm2, xm3
%endif
cvtsi2s%4 xm1, fracd
subp%4 xm2, xm0
mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 xm0, xm2, xm1, xm0
%else
mulp%4 xm2, xm1
addp%4 xm0, xm2
%endif ; cpuflag
; horizontal sum & store
movhlps xm1, xm0
@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
; FMA3 flavour of the float resampling functions: RESAMPLE_FNS is expanded
; under INIT_YMM with the fma3 cpuflag, and only when the
; HAVE_FMA3_EXTERNAL build switch is set.
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
; FMA4 flavour of the float resampling functions: same macro arguments, but
; expanded under INIT_XMM with the fma4 cpuflag, and only when the
; HAVE_FMA4_EXTERNAL build switch is set.
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if ARCH_X86_32
INIT_MMX mmxext

View File

@ -27,30 +27,19 @@
#include "libswresample/resample.h"
int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
/*
 * Declare the two prototypes that belong to one sample format / CPU
 * extension pair:
 *   ff_resample_common_<fmt>_<isa>() and ff_resample_linear_<fmt>_<isa>().
 *
 * The expansion deliberately ends without a semicolon, so every use of the
 * macro is written like an ordinary declaration terminated by ';'.
 */
#define RESAMPLE_FUNCS(fmt, isa)                                            \
    int ff_resample_common_##fmt##_##isa(ResampleContext *c,                \
                                         uint8_t *dst,                      \
                                         const uint8_t *src,                \
                                         int sz, int upd);                  \
    int ff_resample_linear_##fmt##_##isa(ResampleContext *c,                \
                                         uint8_t *dst,                      \
                                         const uint8_t *src,                \
                                         int sz, int upd)
int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
/*
 * One RESAMPLE_FUNCS() line per sample format / CPU extension pair; each
 * line declares both the "common" and the "linear" resampling function
 * for that pair.
 */
RESAMPLE_FUNCS(int16, mmxext);
RESAMPLE_FUNCS(int16, sse2);
RESAMPLE_FUNCS(float, sse);
RESAMPLE_FUNCS(float, avx);
RESAMPLE_FUNCS(float, fma3);
RESAMPLE_FUNCS(float, fma4);
RESAMPLE_FUNCS(double, sse2);
void swresample_dsp_x86_init(ResampleContext *c)
{
@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
}
if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
}
if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
}
}