diff --git a/libavfilter/x86/vf_threshold.asm b/libavfilter/x86/vf_threshold.asm index 56a6c242d8..098069b083 100644 --- a/libavfilter/x86/vf_threshold.asm +++ b/libavfilter/x86/vf_threshold.asm @@ -25,16 +25,18 @@ SECTION_RODATA pb_128: times 16 db 128 +pb_128_0 : times 8 db 0, 128 SECTION .text -%macro THRESHOLD_8 0 +;%1 depth (8 or 16) ; %2 b or w ; %3 constant +%macro THRESHOLD 3 %if ARCH_X86_64 -cglobal threshold8, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x +cglobal threshold%1, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x mov wd, dword wm mov hd, dword hm %else -cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x +cglobal threshold%1, 5, 7, 5, in, threshold, min, max, out, w, x mov wd, r10m %define ilinesizeq r5mp %define tlinesizeq r6mp @@ -43,7 +45,10 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x %define olinesizeq r9mp %define hd r11mp %endif - VBROADCASTI128 m4, [pb_128] + VBROADCASTI128 m4, [%3] +%if %1 == 16 + add wq, wq ; w *= 2 (16 bits instead of 8) +%endif add inq, wq add thresholdq, wq add minq, wq @@ -60,7 +65,7 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x movu m3, [maxq + xq] pxor m0, m4 pxor m1, m4 - pcmpgtb m0, m1 + pcmpgt%2 m0, m1 PBLENDVB m3, m2, m0 movu [outq + xq], m3 add xq, mmsize @@ -77,9 +82,11 @@ RET %endmacro INIT_XMM sse4 -THRESHOLD_8 +THRESHOLD 8, b, pb_128 +THRESHOLD 16, w, pb_128_0 %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 -THRESHOLD_8 +THRESHOLD 8, b, pb_128 +THRESHOLD 16, w, pb_128_0 %endif diff --git a/libavfilter/x86/vf_threshold_init.c b/libavfilter/x86/vf_threshold_init.c index db0559533d..8e42296791 100644 --- a/libavfilter/x86/vf_threshold_init.c +++ b/libavfilter/x86/vf_threshold_init.c @@ -23,20 +23,19 @@ #include "libavutil/x86/cpu.h" #include "libavfilter/threshold.h" -void ff_threshold8_sse4(const uint8_t *in, const uint8_t *threshold, - const uint8_t *min, const uint8_t *max, - uint8_t *out, - ptrdiff_t ilinesize, ptrdiff_t tlinesize, - ptrdiff_t flinesize, ptrdiff_t slinesize, - ptrdiff_t olinesize, - int w, int h); -void ff_threshold8_avx2(const uint8_t *in, const uint8_t *threshold, - const uint8_t *min, const uint8_t *max, - uint8_t *out, - ptrdiff_t ilinesize, ptrdiff_t tlinesize, - ptrdiff_t flinesize, ptrdiff_t slinesize, - ptrdiff_t olinesize, - int w, int h); +#define THRESHOLD_FUNC(depth, opt) \ +void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\ + const uint8_t *min, const uint8_t *max, \ + uint8_t *out, \ + ptrdiff_t ilinesize, ptrdiff_t tlinesize, \ + ptrdiff_t flinesize, ptrdiff_t slinesize, \ + ptrdiff_t olinesize, \ + int w, int h); + +THRESHOLD_FUNC(8, sse4) +THRESHOLD_FUNC(8, avx2) +THRESHOLD_FUNC(16, sse4) +THRESHOLD_FUNC(16, avx2) av_cold void ff_threshold_init_x86(ThresholdContext *s) { @@ -49,5 +48,12 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->threshold = ff_threshold8_avx2; } + } else if (s->depth == 16) { + if (EXTERNAL_SSE4(cpu_flags)) { + s->threshold = ff_threshold16_sse4; + } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + s->threshold = ff_threshold16_avx2; + } } }