lavc/aarch64: motion estimation functions in neon

- ff_pix_abs16_neon
 - ff_pix_abs16_xy2_neon

In direct micro benchmarks of these ff functions versus their C implementations,
these functions performed as follows on AWS Graviton 3.

ff_pix_abs16_neon:
pix_abs_0_0_c: 141.1
pix_abs_0_0_neon: 19.6

ff_pix_abs16_xy2_neon:
pix_abs_0_3_c: 269.1
pix_abs_0_3_neon: 39.3

Tested with:
./tests/checkasm/checkasm --test=motion --bench --disable-linux-perf

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Swinney, Jonathan 2022-06-26 20:58:09 +00:00 committed by Martin Storsjö
parent 20e2aa940c
commit c471cc7474
10 changed files with 407 additions and 1 deletions

View File

@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
@ -47,6 +48,7 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o

View File

@ -0,0 +1,39 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/mpegvideo.h"
/* NEON assembly implementations of the 16-pixel-wide pix_abs comparisons. */
int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                      ptrdiff_t stride, int h);
int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                          ptrdiff_t stride, int h);

/**
 * Install the AArch64 implementations into a motion-estimation compare
 * context.
 *
 * Entries of c->pix_abs are overwritten only when the running CPU reports
 * NEON support; otherwise the context is left untouched.
 *
 * @param c     compare context whose function pointers are updated
 * @param avctx codec context (not read by this function)
 */
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
    const int flags = av_get_cpu_flags();

    /* Nothing to install without NEON. */
    if (!have_neon(flags))
        return;

    c->pix_abs[0][0] = ff_pix_abs16_neon;
    c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
}

View File

@ -0,0 +1,205 @@
/*
* Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// Sum of absolute differences between two 16-pixel-wide blocks.
// Rows are consumed four at a time while h >= 4 (label 1); any rows left over
// are handled one at a time in the completion section (label 2).
// The 32-bit total is returned in w0.
function ff_pix_abs16_neon, export=1
// x0 unused
// x1 uint8_t *pix1
// x2 uint8_t *pix2
// x3 ptrdiff_t stride
// w4 int h
cmp w4, #4 // if h < 4, jump to completion section
movi v18.4S, #0 // clear result accumulator; the b.lt below still uses the flags from cmp
b.lt 2f
1:
// Each ld1 post-increments its pointer by stride, stepping to the next row.
ld1 {v0.16b}, [x1], x3 // load pix1, row 0
ld1 {v4.16b}, [x2], x3 // load pix2, row 0
ld1 {v1.16b}, [x1], x3 // load pix1, row 1
ld1 {v5.16b}, [x2], x3 // load pix2, row 1
uabdl v16.8h, v0.8b, v4.8b // widening absolute difference, bytes 0..7 (starts a fresh accumulator)
uabdl2 v17.8h, v0.16b, v4.16b // widening absolute difference, bytes 8..15 (starts a fresh accumulator)
ld1 {v2.16b}, [x1], x3 // load pix1, row 2
ld1 {v6.16b}, [x2], x3 // load pix2, row 2
uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate, bytes 0..7
uabal2 v17.8h, v1.16b, v5.16b // absolute difference accumulate, bytes 8..15
ld1 {v3.16b}, [x1], x3 // load pix1, row 3
ld1 {v7.16b}, [x2], x3 // load pix2, row 3
uabal v16.8h, v2.8b, v6.8b
uabal2 v17.8h, v2.16b, v6.16b
sub w4, w4, #4 // h -= 4
uabal v16.8h, v3.8b, v7.8b
uabal2 v17.8h, v3.16b, v7.16b
cmp w4, #4 // if h >= 4, loop (flags are consumed by the b.ge below)
add v16.8h, v16.8h, v17.8h // merge the two 16-bit accumulators
uaddlv s16, v16.8h // add up everything in v16 accumulator
add d18, d16, d18 // add to the end result register
b.ge 1b
cbnz w4, 2f // if iterations remain, jump to completion section
fmov w0, s18 // copy result to general purpose register
ret
2:
// Completion section: one row per iteration until h reaches 0.
ld1 {v0.16b}, [x1], x3 // load pix1
ld1 {v4.16b}, [x2], x3 // load pix2
uabdl v16.8h, v0.8b, v4.8b // widening absolute difference, bytes 0..7
uabal2 v16.8h, v0.16b, v4.16b // accumulate absolute difference, bytes 8..15
subs w4, w4, #1 // h -= 1 (sets the flags tested by b.ne)
addv h16, v16.8h // add up v16
add d18, d16, d18 // add to result
b.ne 2b
fmov w0, s18 // copy result to general purpose register
ret
endfunc
// Sum of absolute differences between a 16-pixel-wide pix1 block and pix2
// interpolated at the half-pel position in both x and y: every reference
// sample is the rounded average of pix2[n], pix2[n+1] and the same two samples
// on the following row. Each row of pix2 is therefore read 17 bytes wide, and
// h + 1 rows of pix2 are read in total.
// Rows are consumed four at a time while h >= 4 (label 1); any rows left over
// are handled one at a time in the completion section (label 2).
// The 32-bit total is returned in w0.
// Only v0-v7 and v16-v31 are used as scratch; v8-v15 are callee-saved under
// AAPCS64 and must not be written here.
function ff_pix_abs16_xy2_neon, export=1
// x0 unused
// x1 uint8_t *pix1
// x2 uint8_t *pix2
// x3 ptrdiff_t stride
// w4 int h
add x5, x2, x3 // use x5 to hold uint8_t *pix3
movi v0.2d, #0 // initialize the result register
// Load initial pix2 values for either the unrolled version or completion version.
ldur q4, [x2, #1] // load pix2+1
ldr q3, [x2] // load pix2
uaddl v2.8h, v4.8b, v3.8b // pix2 + pix2+1 0..7
uaddl2 v3.8h, v4.16b, v3.16b // pix2 + pix2+1 8..15
cmp w4, #4 // if h < 4 jump to the completion version
b.lt 2f
1:
// This is an unrolled implementation. It completes 4 iterations of the C for each branch.
// In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
// plus two at the beginning to start.
ldur q5, [x5, #1] // load pix3+1
ld1 {v4.16b}, [x5], x3 // load pix3
ld1 {v1.16b}, [x1], x3 // load pix1
ldur q7, [x5, #1] // load pix3+1
ld1 {v6.16b}, [x5], x3 // load pix3
ld1 {v16.16b}, [x1], x3 // load pix1
ldur q19, [x5, #1] // load pix3+1
ld1 {v18.16b}, [x5], x3 // load pix3
ld1 {v17.16b}, [x1], x3 // load pix1
ldur q22, [x5, #1] // load pix3+1
ld1 {v21.16b}, [x5], x3 // load pix3
ld1 {v20.16b}, [x1], x3 // load pix1
// These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
rshrn v23.8b, v23.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v23.16b, v24.8h, #2 // shift right 2 8..15
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v26.16b, v27.8h, #2 // shift right 2 8..15
uaddl v4.8h, v18.8b, v19.8b // pix3 + pix3+1 0..7
uaddl2 v5.8h, v18.16b, v19.16b // pix3 + pix3+1 8..15
add v28.8h, v2.8h, v4.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v29.8h, v3.8h, v5.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
// v2/v3 end up holding the sums of the last row loaded; they seed the next
// pass of this loop or the completion section.
uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
// Averages are now stored in these registers:
// v23, v26, v28, v30
// pix1 values in these registers:
// v1, v16, v17, v20
// available:
// v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
sub w4, w4, #4 // h -= 4
// Using absolute-difference instructions instead of absolute-difference-accumulate allows
// us to keep the results in 16b vectors instead of widening values with twice the instructions.
// This approach also has fewer data dependencies, allowing better instruction level parallelism.
uabd v4.16b, v1.16b, v23.16b // absolute difference 0..15, i=0
uabd v5.16b, v16.16b, v26.16b // absolute difference 0..15, i=1
uabd v6.16b, v17.16b, v28.16b // absolute difference 0..15, i=2
uabd v7.16b, v20.16b, v30.16b // absolute difference 0..15, i=3
cmp w4, #4 // loop if h >= 4 (flags are consumed by the b.ge below)
// Now add up all the values in each vector, v4-v7 with widening adds
uaddl v19.8h, v4.8b, v5.8b
uaddl2 v18.8h, v4.16b, v5.16b
uaddl v4.8h, v6.8b, v7.8b
uaddl2 v5.8h, v6.16b, v7.16b
add v4.8h, v4.8h, v5.8h
add v4.8h, v4.8h, v18.8h
add v4.8h, v4.8h, v19.8h
uaddlv s4, v4.8h // finish adding up accumulated values
add d0, d0, d4 // add the value to the top level accumulator
b.ge 1b
cbnz w4, 2f // if iterations remain jump to completion section
fmov w0, s0 // copy result to general purpose register
ret
2:
// v2 and v3 are set either at the end of this loop or by the unrolled version,
// which branches here to complete iterations when h % 4 != 0.
ldur q5, [x5, #1] // load pix3+1
ld1 {v4.16b}, [x5], x3 // load pix3
ld1 {v1.16b}, [x1], x3 // load pix1
subs w4, w4, #1 // decrement h (sets the flags tested by b.ne)
uaddl v18.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
uaddl2 v19.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
// divide by 4 to compute the average of values summed above
urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
// v7 is used as scratch here; it is not live in the completion section.
uxtl2 v7.8h, v1.16b // 8->16 bits pix1 8..15
uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
uaba v6.8h, v7.8h, v17.8h // absolute difference accumulate 8..15
mov v2.16b, v18.16b // this row's sums 0..7 become the previous-row sums
mov v3.16b, v19.16b // this row's sums 8..15 become the previous-row sums
uaddlv s6, v6.8h // add up accumulator in v6
add d0, d0, d6 // add to the final result
b.ne 2b // loop if h > 0
fmov w0, s0 // copy result to general purpose register
ret
endfunc

View File

@ -1061,7 +1061,9 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_dsputil_init_dwt(c);
#endif
#if ARCH_ALPHA
#if ARCH_AARCH64
ff_me_cmp_init_aarch64(c, avctx);
#elif ARCH_ALPHA
ff_me_cmp_init_alpha(c, avctx);
#elif ARCH_ARM
ff_me_cmp_init_arm(c, avctx);

View File

@ -80,6 +80,7 @@ typedef struct MECmpContext {
} MECmpContext;
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);

View File

@ -12,6 +12,7 @@ AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o
AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o
AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o
AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o

View File

@ -135,6 +135,9 @@ static const struct {
#if CONFIG_LLVIDENCDSP
{ "llviddspenc", checkasm_check_llviddspenc },
#endif
#if CONFIG_ME_CMP
{ "motion", checkasm_check_motion },
#endif
#if CONFIG_OPUS_DECODER
{ "opusdsp", checkasm_check_opusdsp },
#endif

View File

@ -68,6 +68,7 @@ void checkasm_check_idctdsp(void);
void checkasm_check_jpeg2000dsp(void);
void checkasm_check_llviddsp(void);
void checkasm_check_llviddspenc(void);
void checkasm_check_motion(void);
void checkasm_check_nlmeans(void);
void checkasm_check_opusdsp(void);
void checkasm_check_pixblockdsp(void);

151
tests/checkasm/motion.c Normal file
View File

@ -0,0 +1,151 @@
/*
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <string.h>
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem_internal.h"
#include "libavcodec/me_cmp.h"
#include "checkasm.h"
/**
 * Fill a byte buffer with pseudo-random values from the checkasm generator.
 *
 * @param tab  destination buffer
 * @param size number of bytes to write
 */
static void fill_random(uint8_t *tab, int size)
{
    for (int pos = 0; pos < size; ++pos)
        tab[pos] = rnd() % 256;
}
/**
 * Check one motion-estimation compare function against its reference and
 * benchmark it.
 *
 * Two WIDTH x HEIGHT images are filled with random bytes. For ITERATIONS
 * random positions inside the second image, the reference and the tested
 * implementation are called with the same arguments and their return values
 * are compared; the first mismatch is reported and ends the comparison.
 *
 * @param name      label used for reporting and benchmarking
 * @param test_func function under test; NULL entries are skipped
 */
static void test_motion(const char *name, me_cmp_func test_func)
{
    /* test configuration */
#define ITERATIONS 16
#define WIDTH 64
#define HEIGHT 64

    /* motion estimation can look up to 17 bytes ahead */
    static const int look_ahead = 17;

    int i, x, y, d1, d2;
    uint8_t *ptr;

    /* img1 is passed unmodified as blk1, which must be aligned to the block
     * width (8 or 16), so the buffers need 16-byte alignment. */
    LOCAL_ALIGNED_16(uint8_t, img1, [WIDTH * HEIGHT]);
    LOCAL_ALIGNED_16(uint8_t, img2, [WIDTH * HEIGHT]);

    declare_func_emms(AV_CPU_FLAG_MMX, int, struct MpegEncContext *c,
                      uint8_t *blk1 /* align width (8 or 16) */,
                      uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
                      int h);

    if (test_func == NULL) {
        return;
    }

    /* test correctness */
    fill_random(img1, WIDTH * HEIGHT);
    fill_random(img2, WIDTH * HEIGHT);

    if (check_func(test_func, "%s", name)) {
        for (i = 0; i < ITERATIONS; i++) {
            /* keep look_ahead bytes/rows of margin so reads stay inside img2 */
            x = rnd() % (WIDTH - look_ahead);
            y = rnd() % (HEIGHT - look_ahead);

            ptr = img2 + y * WIDTH + x;
            /* NOTE(review): h is always 8 here; implementations that take a
             * separate path for other heights are not exercised. */
            d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
            d1 = call_new(NULL, img1, ptr, WIDTH, 8);

            if (d1 != d2) {
                fail();
                printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, y, d1, d2);
                break;
            }
        }
        // benchmark with the final value of ptr
        bench_new(NULL, img1, ptr, WIDTH, 8);
    }
}
/*
 * X-macro: applies XX() to the name of every one-dimensional array of compare
 * functions that the motion test iterates over.
 */
#define ME_CMP_1D_ARRAYS(XX)                                                   \
    XX(sad)                                                                    \
    XX(sse)                                                                    \
    XX(hadamard8_diff)                                                         \
    XX(vsad)                                                                   \
    XX(vsse)                                                                   \
    XX(nsse)                                                                   \
    XX(me_pre_cmp)                                                             \
    XX(me_cmp)                                                                 \
    XX(me_sub_cmp)                                                             \
    XX(mb_cmp)                                                                 \
    XX(ildct_cmp)                                                              \
    XX(frame_skip_cmp)                                                         \
    XX(median_sad)

// tests for functions not yet implemented
// (no line continuation on the last entry: a trailing backslash would splice
// the #endif into the skipped text and leave the #if unterminated)
#if 0
    XX(dct_sad)                                                                \
    XX(quant_psnr)                                                             \
    XX(bit)                                                                    \
    XX(rd)                                                                     \
    XX(w53)                                                                    \
    XX(w97)                                                                    \
    XX(dct_max)                                                                \
    XX(dct264_sad)
#endif
/**
 * Initialise a MECmpContext through ff_me_cmp_init() and run test_motion()
 * on every entry of its pix_abs table and of each array listed in
 * ME_CMP_1D_ARRAYS.
 */
static void check_motion(void)
{
    char buf[64];
    AVCodecContext *av_ctx;
    MECmpContext me_ctx;

    memset(&me_ctx, 0, sizeof(me_ctx));

    /* allocate AVCodecContext */
    av_ctx = avcodec_alloc_context3(NULL);
    if (!av_ctx) {
        /* allocation failed: nothing can be initialised or tested */
        fprintf(stderr, "check_motion: failed to allocate AVCodecContext\n");
        return;
    }
    av_ctx->flags |= AV_CODEC_FLAG_BITEXACT;

    ff_me_cmp_init(&me_ctx, av_ctx);

    /* two-dimensional pix_abs table */
    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.pix_abs); i++) {
        for (int j = 0; j < FF_ARRAY_ELEMS(me_ctx.pix_abs[0]); j++) {
            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
            test_motion(buf, me_ctx.pix_abs[i][j]);
        }
    }

    /* one loop per one-dimensional array, labelled "<array name>_<index>" */
#define XX(me_cmp_array)                                                        \
    for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.me_cmp_array); i++) {             \
        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                     \
        test_motion(buf, me_ctx.me_cmp_array[i]);                               \
    }
    ME_CMP_1D_ARRAYS(XX)
#undef XX

    avcodec_free_context(&av_ctx);
}
/* checkasm entry point for the motion tests: runs them, then reports the
 * outcome under the label "motion". */
void checkasm_check_motion(void)
{
check_motion();
report("motion");
}

View File

@ -23,6 +23,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \
fate-checkasm-jpeg2000dsp \
fate-checkasm-llviddsp \
fate-checkasm-llviddspenc \
fate-checkasm-motion \
fate-checkasm-opusdsp \
fate-checkasm-pixblockdsp \
fate-checkasm-sbrdsp \