diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm index 5b241220d8..8e7e4591b4 100644 --- a/libavfilter/x86/vf_v360.asm +++ b/libavfilter/x86/vf_v360.asm @@ -165,6 +165,49 @@ DEFINE_ARGS dst, width, src, x, u, v, ker %if ARCH_X86_64 +INIT_YMM avx2 +cglobal remap3_8bit_line, 7, 11, 8, dst, width, src, in_linesize, u, v, ker, x, y, tmp, z + movsxdifnidn widthq, widthd + xor zq, zq + xor yq, yq + xor xq, xq + movd xm0, in_linesized + pcmpeqw m7, m7 + vpbroadcastd m0, xm0 + vpbroadcastd m6, [pd_255] + + .loop: + pmovsxwd m1, [kerq + yq] + pmovsxwd m2, [vq + yq] + pmovsxwd m3, [uq + yq] + + pmulld m4, m2, m0 + paddd m4, m3 + mova m3, m7 + vpgatherdd m2, [srcq + m4], m3 + pand m2, m6 + pmulld m2, m1 + HADDD m2, m1 + movzx tmpq, word [vq + yq + 16] + imul tmpq, in_linesizeq + movzx zq, word [uq + yq + 16] + add tmpq, zq + movzx zq, byte [srcq + tmpq] + movzx tmpq, word [kerq + yq + 16] + imul zd, tmpd + movd xm1, zd + paddd m2, m1 + psrld m2, m2, 0xe + + packuswb m2, m2 + pextrb [dstq+xq], xm2, 0 + + add xq, 1 + add yq, 18 + cmp xq, widthq + jl .loop + RET + INIT_YMM avx2 cglobal remap4_8bit_line, 7, 9, 11, dst, width, src, in_linesize, u, v, ker, x, y movsxdifnidn widthq, widthd diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c index babc6c426a..5b1decd777 100644 --- a/libavfilter/x86/vf_v360_init.c +++ b/libavfilter/x86/vf_v360_init.c @@ -29,6 +29,9 @@ void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdi void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, const int16_t *const u, const int16_t *const v, const int16_t *const ker); +void ff_remap3_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, + const int16_t *const u, const int16_t *const v, const int16_t *const ker); + void ff_remap4_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize, const int16_t *const u, const int16_t *const v, const int16_t *const ker); @@ -55,6 +58,9 @@ av_cold void ff_v360_init_x86(V360Context *s, int depth) s->remap_line = ff_remap2_16bit_line_avx2; #if ARCH_X86_64 + if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == LAGRANGE9 && depth <= 8) + s->remap_line = ff_remap3_8bit_line_avx2; + if (EXTERNAL_AVX2_FAST(cpu_flags) && (s->interp == BICUBIC || s->interp == LANCZOS) && depth <= 8) s->remap_line = ff_remap4_8bit_line_avx2;