zimg: add slice threading and use it by default

This probably makes it much faster (I wouldn't know, I didn't run any
benchmarks). Seems to work as well (although I'm not sure; it's not
like I'd perform rigorous tests).

With threading, the scale_zimg test mysteriously treats color in fully
transparent alpha differently, which makes no sense and isn't visible
(but makes the test fail). I can't be bothered to investigate this
more. What do you do with failing tests? Correct, you disable them. Or
rather, you disable whatever appears to cause them to fail, which is the
threading in this case.

This change mostly follows tile_example.cpp. The slice size has a
minimum of 64, which was suggested by the zimg author. Some of this
commit is a bit inelegant and weird, such as recomputing the scale
factor for every slice, or the way slice_h is managed. Too lazy to make
this more elegant.
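
In rough terms, the slice sizing in mp_zimg_config() comes down to the
following (a simplified sketch of the logic in the video/zimg.c hunk below,
using mpv's MP_ALIGN_UP/MPCLAMP macros and FFmpeg's av_cpu_count(); not a
verbatim excerpt):

    // Sketch: how the output height is split into slices.
    // dst_h = output height, align_y = vertical alignment required by the
    // destination format, requested = value of --zimg-threads (0 = auto).
    static int pick_slice_height(int dst_h, int align_y, int requested)
    {
        int slices = requested > 0 ? requested : av_cpu_count();
        slices = MPCLAMP(slices, 1, 64);
        int full_h = MP_ALIGN_UP(dst_h, align_y);      // padded output height
        int slice_h = (full_h + slices - 1) / slices;  // split roughly evenly
        slice_h = MP_ALIGN_UP(slice_h, align_y);       // keep format alignment
        slice_h = MP_ALIGN_UP(slice_h, 64);            // enforce the minimum of 64
        return slice_h;  // actual slice count = ceil(full_h / slice_h),
                         // worker threads = slices - 1 (slice 0 runs inline)
    }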

zimg git had a regression around active_region (which is needed by the
slicing); it was fixed in commit 83071706b2e6bc634. Apparently, the
bug was never released, so just add a warning to the manpage.
wm4 2020-07-14 22:52:27 +02:00
parent f1290f7095
commit 2a89da0c85
4 changed files with 123 additions and 20 deletions

DOCS/man/options.rst

@@ -4283,6 +4283,17 @@ Software Scaler
``--zimg-dither=<no|ordered|random|error-diffusion>``
Dithering (default: random).
``--zimg-threads=<auto|integer>``
Set the maximum number of threads to use for scaling (default: auto).
``auto`` uses the number of logical cores on the current machine. Note that
the scaler may use fewer threads (or even just 1 thread) depending on the
image size and other factors. Passing a value of 1 disables threading and
always scales the image in a single operation. Higher thread counts waste
resources, but typically make scaling faster.
Note that some zimg git versions had bugs that corrupt the output if
threading is used.
``--zimg-fast=<yes|no>``
Allow optimizations that help with performance, but reduce quality (default:
yes). Currently, this may simplify gamma conversion operations.
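
For code that drives the scaler directly (as the test change below does), the
same knob is the threads field of struct zimg_opts; a hypothetical snippet,
not part of the manpage text:

    struct mp_zimg_context *zimg = mp_zimg_alloc();
    zimg->opts.threads = 0;   // 0 = "auto": pick slice/thread count from CPU cores
    // zimg->opts.threads = 1; // force a single slice / no worker threads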

test/scale_zimg.c

@@ -24,6 +24,7 @@ static const struct scale_test_fns fns = {
static void run(struct test_ctx *ctx)
{
struct mp_zimg_context *zimg = mp_zimg_alloc();
zimg->opts.threads = 1;
struct scale_test *stest = talloc_zero(NULL, struct scale_test);
stest->fns = &fns;

video/zimg.c

@@ -17,12 +17,13 @@
#include <math.h>
#include <libavutil/bswap.h>
#include <libavutil/pixfmt.h>
#include <libavutil/cpu.h>
#include "common/common.h"
#include "common/msg.h"
#include "csputils.h"
#include "misc/thread_pool.h"
#include "misc/thread_tools.h"
#include "options/m_config.h"
#include "options/m_option.h"
#include "repack.h"
@@ -70,6 +71,7 @@ const struct m_sub_options zimg_conf = {
{"random", ZIMG_DITHER_RANDOM},
{"error-diffusion", ZIMG_DITHER_ERROR_DIFFUSION})},
{"fast", OPT_FLAG(fast)},
{"threads", OPT_CHOICE(threads, {"auto", 0}), M_RANGE(1, 64)},
{0}
},
.size = sizeof(struct zimg_opts),
@@ -82,6 +84,9 @@ struct mp_zimg_state {
void *tmp_alloc;
struct mp_zimg_repack *src;
struct mp_zimg_repack *dst;
int slice_y, slice_h; // y start position, height of target slice
double scale_y;
struct mp_waiter thread_waiter;
};
struct mp_zimg_repack {
@@ -102,6 +107,7 @@ struct mp_zimg_repack {
// Temporary memory for zimg buffer.
zimg_image_buffer zbuf;
struct mp_image cropped_tmp;
int real_w, real_h; // aligned size
};
@@ -198,6 +204,7 @@ static void free_mp_zimg(void *p)
struct mp_zimg_context *ctx = p;
destroy_zimg(ctx);
TA_FREEP(&ctx->tp);
}
struct mp_zimg_context *mp_zimg_alloc(void)
@@ -242,11 +249,19 @@ static int repack_entrypoint(void *user, unsigned i, unsigned x0, unsigned x1)
return 0;
}
static bool wrap_buffer(struct mp_zimg_repack *r, struct mp_image *mpi)
static bool wrap_buffer(struct mp_zimg_state *st, struct mp_zimg_repack *r,
struct mp_image *a_mpi)
{
zimg_image_buffer *buf = &r->zbuf;
*buf = (zimg_image_buffer){ZIMG_API_VERSION};
struct mp_image *mpi = a_mpi;
if (r->pack) {
mpi = &r->cropped_tmp;
*mpi = *a_mpi;
mp_image_crop(mpi, 0, st->slice_y, mpi->w, st->slice_y + st->slice_h);
}
bool direct[MP_MAX_PLANES] = {0};
for (int p = 0; p < mpi->num_planes; p++) {
@@ -354,16 +369,27 @@ static bool setup_format(zimg_image_format *zfmt, struct mp_zimg_repack *r,
r->num_planes = desc.num_planes;
// Take care of input/output size, including slicing.
// Note: formats with subsampled chroma may have odd width or height in
// mpv and FFmpeg. This is because the width/height is actually a cropping
// rectangle. Reconstruct the image allocation size and set the cropping.
zfmt->width = r->real_w = MP_ALIGN_UP(fmt.w, 1 << desc.chroma_xs);
zfmt->height = r->real_h = MP_ALIGN_UP(fmt.h, 1 << desc.chroma_ys);
if (!r->pack && st) {
// Relies on st->dst being initialized first.
struct mp_zimg_repack *dst = st->dst;
zfmt->active_region.width = dst->real_w * (double)fmt.w / dst->fmt.w;
zfmt->active_region.height = dst->real_h * (double)fmt.h / dst->fmt.h;
if (st) {
if (r->pack) {
zfmt->height = r->real_h = st->slice_h =
MPMIN(st->slice_y + st->slice_h, r->real_h) - st->slice_y;
assert(MP_IS_ALIGNED(r->real_h, 1 << desc.chroma_ys));
} else {
// Relies on st->dst being initialized first.
struct mp_zimg_repack *dst = st->dst;
zfmt->active_region.width = dst->real_w * (double)fmt.w / dst->fmt.w;
zfmt->active_region.height = dst->real_h * st->scale_y;
zfmt->active_region.top = st->slice_y * st->scale_y;
}
}
zfmt->subsample_w = desc.chroma_xs;
@@ -440,13 +466,13 @@ static bool allocate_buffer(struct mp_zimg_state *st, struct mp_zimg_repack *r)
// Either ZIMG_BUFFER_MAX, or a power-of-2 slice buffer.
assert(r->zmask[0] == ZIMG_BUFFER_MAX || MP_IS_POWER_OF_2(r->zmask[0] + 1));
int h = r->zmask[0] == ZIMG_BUFFER_MAX ? r->fmt.h : r->zmask[0] + 1;
if (h >= r->fmt.h) {
h = r->fmt.h;
int h = r->zmask[0] == ZIMG_BUFFER_MAX ? r->real_h : r->zmask[0] + 1;
if (h >= r->real_h) {
h = r->real_h;
r->zmask[0] = ZIMG_BUFFER_MAX;
}
r->tmp = mp_image_alloc(r->zimgfmt, r->fmt.w, h);
r->tmp = mp_image_alloc(r->zimgfmt, r->real_w, h);
talloc_steal(r, r->tmp);
if (!r->tmp)
@@ -465,13 +491,18 @@ static bool allocate_buffer(struct mp_zimg_state *st, struct mp_zimg_repack *r)
}
static bool mp_zimg_state_init(struct mp_zimg_context *ctx,
struct mp_zimg_state *st)
struct mp_zimg_state *st,
int slice_y, int slice_h)
{
struct zimg_opts *opts = &ctx->opts;
st->src = talloc_zero(NULL, struct mp_zimg_repack);
st->dst = talloc_zero(NULL, struct mp_zimg_repack);
st->scale_y = ctx->src.h / (double)ctx->dst.h;
st->slice_y = slice_y;
st->slice_h = slice_h;
zimg_image_format src_fmt, dst_fmt;
// Note: do dst first, because src uses fields from dst.
@@ -532,15 +563,49 @@ bool mp_zimg_config(struct mp_zimg_context *ctx)
if (ctx->opts_cache)
mp_zimg_update_from_cmdline(ctx);
struct mp_zimg_state *st = talloc_zero(NULL, struct mp_zimg_state);
MP_TARRAY_APPEND(ctx, ctx->states, ctx->num_states, st);
int slices = ctx->opts.threads;
if (slices < 1)
slices = av_cpu_count();
slices = MPCLAMP(slices, 1, 64);
if (!mp_zimg_state_init(ctx, st)) {
destroy_zimg(ctx);
return false;
struct mp_imgfmt_desc dstfmt = mp_imgfmt_get_desc(ctx->dst.imgfmt);
if (!dstfmt.align_y)
goto fail;
int full_h = MP_ALIGN_UP(ctx->dst.h, dstfmt.align_y);
int slice_h = (full_h + slices - 1) / slices;
slice_h = MP_ALIGN_UP(slice_h, dstfmt.align_y);
slice_h = MP_ALIGN_UP(slice_h, 64); // for dithering and minimum slice size
slices = (full_h + slice_h - 1) / slice_h;
int threads = slices - 1;
if (threads != ctx->current_thread_count) {
// Just destroy and recreate all - dumb and costly, but rarely happens.
TA_FREEP(&ctx->tp);
ctx->current_thread_count = 0;
if (threads) {
MP_VERBOSE(ctx, "using %d threads for scaling\n", threads);
ctx->tp = mp_thread_pool_create(NULL, threads, threads, threads);
if (!ctx->tp)
goto fail;
ctx->current_thread_count = threads;
}
}
for (int n = 0; n < slices; n++) {
struct mp_zimg_state *st = talloc_zero(NULL, struct mp_zimg_state);
MP_TARRAY_APPEND(ctx, ctx->states, ctx->num_states, st);
if (!mp_zimg_state_init(ctx, st, n * slice_h, slice_h))
goto fail;
}
assert(ctx->num_states == slices);
return true;
fail:
destroy_zimg(ctx);
return false;
}
bool mp_zimg_config_image_params(struct mp_zimg_context *ctx)
@@ -577,6 +642,14 @@ static void do_convert(struct mp_zimg_state *st)
repack_entrypoint, st->dst);
}
static void do_convert_thread(void *ptr)
{
struct mp_zimg_state *st = ptr;
do_convert(st);
mp_waiter_wakeup(&st->thread_waiter, 0);
}
bool mp_zimg_convert(struct mp_zimg_context *ctx, struct mp_image *dst,
struct mp_image *src)
{
@@ -591,15 +664,30 @@ bool mp_zimg_convert(struct mp_zimg_context *ctx, struct mp_image *dst,
for (int n = 0; n < ctx->num_states; n++) {
struct mp_zimg_state *st = ctx->states[n];
if (!wrap_buffer(st->src, src) || !wrap_buffer(st->dst, dst)) {
if (!wrap_buffer(st, st->src, src) || !wrap_buffer(st, st->dst, dst)) {
MP_ERR(ctx, "zimg repacker initialization failed.\n");
return false;
}
}
assert(ctx->num_states == 1);
for (int n = 1; n < ctx->num_states; n++) {
struct mp_zimg_state *st = ctx->states[n];
st->thread_waiter = (struct mp_waiter)MP_WAITER_INITIALIZER;
bool r = mp_thread_pool_run(ctx->tp, do_convert_thread, st);
// This is guaranteed by the API; and unrolling would be inconvenient.
assert(r);
}
do_convert(ctx->states[0]);
for (int n = 1; n < ctx->num_states; n++) {
struct mp_zimg_state *st = ctx->states[n];
mp_waiter_wait(&st->thread_waiter);
}
return true;
}
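
For context, this is roughly how a caller (like the scale_zimg test above)
uses the whole thing; a minimal sketch, assuming mp_zimg_convert() picks up
the image parameters and (re)creates the per-slice states via
mp_zimg_config() as needed:

    struct mp_zimg_context *zimg = mp_zimg_alloc();  // talloc allocation, always succeeds
    zimg->opts.threads = 0;                          // "auto": slice threading enabled
    // ... obtain struct mp_image *src_img and *dst_img with valid params ...
    if (!mp_zimg_convert(zimg, dst_img, src_img)) {
        // conversion failed (e.g. unsupported format combination)
    }
    talloc_free(zimg);  // also tears down the thread pool and per-slice states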

video/zimg.h

@@ -20,6 +20,7 @@ struct zimg_opts {
double scaler_chroma_params[2];
int dither;
int fast;
int threads;
};
extern const struct zimg_opts zimg_opts_defaults;
@@ -42,6 +43,8 @@ struct mp_zimg_context {
struct m_config_cache *opts_cache;
struct mp_zimg_state **states;
int num_states;
struct mp_thread_pool *tp;
int current_thread_count;
};
// Allocate a zimg context. Always succeeds. Returns a talloc pointer (use