/*
 * Copyright (c) Lynne
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"
#include "libavutil/random_seed.h"
#include "libavutil/opt.h"
#include "vulkan_filter.h"
#include "vulkan_spirv.h"
#include "internal.h"
#include "video.h"
#define TYPE_NAME "vec4"
#define TYPE_ELEMS 4
#define TYPE_SIZE (TYPE_ELEMS*4)
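
/* Weights are computed TYPE_ELEMS research-window offsets at a time: each
 * shader invocation handles four offsets at once, packed into a DTYPE (vec4). */
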
typedef struct NLMeansVulkanContext {
    FFVulkanContext vkctx;

    int initialized;
    FFVkExecPool e;
    FFVkQueueFamilyCtx qf;
    VkSampler sampler;

    AVBufferPool *integral_buf_pool;
    AVBufferPool *ws_buf_pool;

    FFVkBuffer xyoffsets_buf;

    int pl_weights_rows;
    FFVulkanPipeline pl_weights;
    FFVkSPIRVShader shd_weights;

    FFVulkanPipeline pl_denoise;
    FFVkSPIRVShader shd_denoise;

    int *xoffsets;
    int *yoffsets;
    int nb_offsets;
    float strength[4];
    int patch[4];

    struct nlmeans_opts {
        int r;
        double s;
        double sc[4];
        int p;
        int pc[4];
        int t;
    } opts;
} NLMeansVulkanContext;

extern const char *ff_source_prefix_sum_comp;
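
/* Emits GLSL that loads the center sample (s1) and the samples at the four
 * current research offsets (s2), then squares the differences; this seeds
 * the per-offset integral image. */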
static void insert_first(FFVkSPIRVShader *shd, int r, const char *off, int horiz, int plane, int comp)
{
    GLSLF(4, s1 = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);

    GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
          ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);

    GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
}
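
/* Emits the horizontal prefix-sum pass: each invocation owns nb_rows rows
 * and accumulates a running sum along x into the integral buffer. */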
static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
{
    GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    if (!first)
        GLSLC(1, barrier(); );
    GLSLC(0, );
    GLSLF(1, if (pos.y < height[%i]) { ,plane);
    GLSLC(2, #pragma unroll(1) );
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLC(3, prefix_sum = DTYPE(0); );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(0, );
    GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
    if (first)
        insert_first(shd, 0, "r", 0, plane, comp);
    else
        GLSLC(4, s2 = dst.v[pos.x]; );
    GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
    GLSLC(4, prefix_sum += s2; );
    GLSLC(3, } );
    GLSLC(2, } );
    GLSLC(1, } );
    GLSLC(0, );
}
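
/* Emits the vertical prefix-sum pass: each invocation owns nb_rows columns
 * and accumulates psum[] down the image, completing the 2D integral image. */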
static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
{
    GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, #pragma unroll(1) );
    GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
    GLSLC(2, psum[r] = DTYPE(0); );
    GLSLC(0, );
    if (!first)
        GLSLC(1, barrier(); );
    GLSLC(0, );
    GLSLF(1, if (pos.x < width[%i]) { ,plane);
    GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
    GLSLC(3, offset = int_stride * uint64_t(pos.y); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(0, );
    GLSLC(3, #pragma unroll(1) );
    GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
    if (first)
        insert_first(shd, 0, "r", 1, plane, comp);
    else
        GLSLC(4, s2 = dst.v[pos.x + r]; );
    GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
    GLSLC(4, psum[r] += s2; );
    GLSLC(3, } );
    GLSLC(2, } );
    GLSLC(1, } );
    GLSLC(0, );
}
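
/* Emits the weights pass: the summed squared patch difference is read back
 * from the integral image with four corner taps (d + a - b - c), turned into
 * a weight via exp(), and accumulated into the weights/sums buffers
 * (atomically when multiple dispatches run in parallel). */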
static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
                                int t, int dst_comp, int plane, int comp)
{
    GLSLF(1, p = patch_size[%i]; ,dst_comp);
    GLSLC(0, );
    GLSLC(1, barrier(); );
    GLSLC(0, );
    if (!vert) {
        GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
        GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
        GLSLC(3, break; );
        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
        GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    } else {
        GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
        GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
        GLSLC(3, break; );
        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
        GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    }
    GLSLC(0, );
    GLSLC(3, a = DTYPE(0); );
    GLSLC(3, b = DTYPE(0); );
    GLSLC(3, c = DTYPE(0); );
    GLSLC(3, d = DTYPE(0); );
    GLSLC(0, );
    GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
    GLSLC(0, );
    GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
    GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
    GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
    GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
    GLSLC(0, );
    GLSLC(3, if (lt == false) { );
    GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, a = dst.v[pos.x - p]; );
    GLSLC(4, c = dst.v[pos.x + p]; );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, b = dst.v[pos.x - p]; );
    GLSLC(4, d = dst.v[pos.x + p]; );
    GLSLC(3, } );
    GLSLC(0, );
    GLSLC(3, patch_diff = d + a - b - c; );
    GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
    GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
    GLSLC(3, sum = dot(w, src*255); );
    GLSLC(0, );
    if (t > 1) {
        GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
        GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
    } else {
        GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
        GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
    }
    GLSLC(2, } );
    GLSLC(1, } );
}
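
/* Push constants for the weights pipeline; the layout must stay in sync with
 * the pushConstants block emitted into the shader below. */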
typedef struct HorizontalPushData {
    uint32_t width[4];
    uint32_t height[4];
    uint32_t ws_stride[4];
    int32_t  patch_size[4];
    float    strength[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint64_t int_stride;
    uint32_t xyoffs_start;
} HorizontalPushData;
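
/* Builds the weights pipeline: generates GLSL for the two prefix-sum passes
 * and the weights accumulation, sizes the workgroup to cover the larger
 * frame dimension, and compiles the result to SPIR-V. */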
static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
                                         FFVulkanPipeline *pl, FFVkSPIRVShader *shd,
                                         VkSampler sampler, FFVkSPIRVCompiler *spv,
                                         int width, int height, int t,
                                         const AVPixFmtDescriptor *desc,
                                         int planes, int *nb_rows)
{
    int err;
    uint8_t *spv_data;
    size_t spv_len;
    void *spv_opaque = NULL;
    FFVulkanDescriptorSetBinding *desc_set;
    int max_dim = FFMAX(width, height);
    uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
    int wg_size, wg_rows;

    /* Pick a workgroup width that covers the largest frame dimension; if the
     * device limit is smaller, have each invocation handle several
     * rows/columns instead. */
    wg_size = max_wg;
    wg_rows = 1;

    if (max_wg > max_dim) {
        wg_size = max_dim;
    } else if (max_wg < max_dim) {
        /* Make it fit */
        while (wg_size*wg_rows < max_dim)
            wg_rows++;
    }
    RET(ff_vk_shader_init(pl, shd, "nlmeans_weights", VK_SHADER_STAGE_COMPUTE_BIT, 0));
    ff_vk_shader_set_compute_sizes(shd, wg_size, 1, 1);
    *nb_rows = wg_rows;

    if (t > 1)
        GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
    GLSLC(0, );
    GLSLF(0, #define DTYPE %s ,TYPE_NAME);
    GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
    GLSLC(0, );
    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
    GLSLC(1, DTYPE v[]; );
    GLSLC(0, }; );
    GLSLC(0, );
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 width; );
    GLSLC(1, uvec4 height; );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(1, ivec4 patch_size; );
    GLSLC(1, vec4 strength; );
    GLSLC(1, DataBuffer integral_base; );
    GLSLC(1, uint64_t integral_size; );
    GLSLC(1, uint64_t int_stride; );
    GLSLC(1, uint xyoffs_start; );
    GLSLC(0, }; );
    GLSLC(0, );

    ff_vk_add_push_constant(pl, 0, sizeof(HorizontalPushData), VK_SHADER_STAGE_COMPUTE_BIT);

    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name       = "input_img",
            .type       = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            .dimensions = 2,
            .elems      = planes,
            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            .samplers   = DUP_SAMPLER(sampler),
        },
        {
            .name        = "weights_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_0[];",
        },
        {
            .name        = "sums_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_0[];",
        },
        {
            .name        = "weights_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_1[];",
        },
        {
            .name        = "sums_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_1[];",
        },
        {
            .name        = "weights_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_2[];",
        },
        {
            .name        = "sums_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_2[];",
        },
        {
            .name        = "weights_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_3[];",
        },
        {
            .name        = "sums_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_3[];",
        },
    };
    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1 + 2*desc->nb_components, 0, 0));

    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name        = "xyoffsets_buffer",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "ivec2 xyoffsets[];",
        },
    };
    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1, 1, 0));

    GLSLC(0, );
    GLSLC(0, void main() );
    GLSLC(0, { );
    GLSLC(1, uint64_t offset; );
    GLSLC(1, DataBuffer dst; );
    GLSLC(1, float s1; );
    GLSLC(1, DTYPE s2; );
    GLSLC(1, DTYPE prefix_sum; );
    GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
    GLSLC(1, int r; );
    GLSLC(1, ivec2 pos; );
    GLSLC(1, int p; );
    GLSLC(0, );
    GLSLC(1, DataBuffer integral_data; );
    GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS);
    GLSLC(0, );
    GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
    GLSLC(0, );
    GLSLC(1, offset = integral_size * invoc_idx; );
    GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
    for (int i = 0; i < TYPE_ELEMS; i++)
        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i]; ,i,TYPE_ELEMS,i);
    GLSLC(0, );
    GLSLC(1, DTYPE a; );
    GLSLC(1, DTYPE b; );
    GLSLC(1, DTYPE c; );
    GLSLC(1, DTYPE d; );
    GLSLC(0, );
    GLSLC(1, DTYPE patch_diff; );
    if (TYPE_ELEMS == 4) {
        GLSLC(1, vec4 src; );
        GLSLC(1, vec4 w; );
    } else {
        GLSLC(1, vec4 src[4]; );
        GLSLC(1, vec4 w[4]; );
    }
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    GLSLC(0, );
    GLSLC(1, bool lt; );
    GLSLC(1, bool gt; );
    GLSLC(0, );

    /* Emit the two prefix-sum passes (ordered by frame aspect) and the
     * weights accumulation for each component. */
    for (int i = 0; i < desc->nb_components; i++) {
        int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
        if (width >= height) {
            insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
            insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
            insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off);
        } else {
            insert_vertical_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
            insert_horizontal_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
            insert_weights_pass(shd, *nb_rows, 1, t, i, desc->comp[i].plane, off);
        }
    }

    GLSLC(0, } );

    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
    RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));

    RET(ff_vk_init_compute_pipeline(vkctx, pl, shd));
    RET(ff_vk_exec_pipeline_register(vkctx, exec, pl));

fail:
    if (spv_opaque)
        spv->free_shader(spv, &spv_opaque);

    return err;
}

typedef struct DenoisePushData {
    uint32_t ws_stride[4];
} DenoisePushData;
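
/* Builds the denoise pipeline: a 32x32 compute shader that normalizes the
 * accumulated sums by the accumulated weights and writes the output image. */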
static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
                                         FFVulkanPipeline *pl, FFVkSPIRVShader *shd,
                                         VkSampler sampler, FFVkSPIRVCompiler *spv,
                                         const AVPixFmtDescriptor *desc, int planes)
{
    int err;
    uint8_t *spv_data;
    size_t spv_len;
    void *spv_opaque = NULL;
    FFVulkanDescriptorSetBinding *desc_set;

    RET(ff_vk_shader_init(pl, shd, "nlmeans_denoise",
                          VK_SHADER_STAGE_COMPUTE_BIT, 0));

    ff_vk_shader_set_compute_sizes(shd, 32, 32, 1);

    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(0, }; );

    ff_vk_add_push_constant(pl, 0, sizeof(DenoisePushData), VK_SHADER_STAGE_COMPUTE_BIT);

    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name       = "input_img",
            .type       = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            .dimensions = 2,
            .elems      = planes,
            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            .samplers   = DUP_SAMPLER(sampler),
        },
        {
            .name       = "output_img",
            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
            .mem_layout = ff_vk_shader_rep_fmt(vkctx->output_format),
            .mem_quali  = "writeonly",
            .dimensions = 2,
            .elems      = planes,
            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
        },
        {
            .name        = "weights_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_0[];",
        },
        {
            .name        = "sums_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_0[];",
        },
        {
            .name        = "weights_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_1[];",
        },
        {
            .name        = "sums_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_1[];",
        },
        {
            .name        = "weights_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_2[];",
        },
        {
            .name        = "sums_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_2[];",
        },
        {
            .name        = "weights_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_3[];",
        },
        {
            .name        = "sums_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_3[];",
        },
    };
    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 2 + 2*desc->nb_components, 0, 0));

    GLSLC(0, void main() );
    GLSLC(0, { );
    GLSLC(1, ivec2 size; );
    GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
    GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
    GLSLC(0, );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    GLSLC(1, vec4 src; );
    GLSLC(1, vec4 r; );
    GLSLC(0, );
    GLSLC(1, size = imageSize(output_img[plane]); );
    GLSLC(1, if (!IS_WITHIN(pos, size)) );
    GLSLC(2, return; );
    GLSLC(0, );
    GLSLC(1, src = texture(input_img[plane], pos); );
    GLSLC(0, );
    for (int c = 0; c < desc->nb_components; c++) {
        int off = desc->comp[c].offset / (FFALIGN(desc->comp[c].depth, 8)/8);
        GLSLF(1, if (plane == %i) { ,desc->comp[c].plane);
        GLSLF(2, w_sum = weights_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c);
        GLSLF(2, sum = sums_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c);
        GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
        GLSLC(1, } );
        GLSLC(0, );
    }
    GLSLC(1, imageStore(output_img[plane], pos, r); );
    GLSLC(0, } );

    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
    RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));

    RET(ff_vk_init_compute_pipeline(vkctx, pl, shd));
    RET(ff_vk_exec_pipeline_register(vkctx, exec, pl));

fail:
    if (spv_opaque)
        spv->free_shader(spv, &spv_opaque);

    return err;
}
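
/* One-time setup: validates the options, precomputes the research-window
 * offsets and per-component strengths, uploads the offsets to a GPU buffer
 * and builds both pipelines. */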
static av_cold int init_filter(AVFilterContext *ctx)
{
    int rad, err;
    int xcnt = 0, ycnt = 0;
    NLMeansVulkanContext *s = ctx->priv;
    FFVulkanContext *vkctx = &s->vkctx;
    const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
    FFVkSPIRVCompiler *spv = NULL;
    int *offsets_buf;
    int offsets_dispatched = 0, nb_dispatches = 0;
    const AVPixFmtDescriptor *desc;

    desc = av_pix_fmt_desc_get(vkctx->output_format);
    if (!desc)
        return AVERROR(EINVAL);

    if (!(s->opts.r & 1)) {
        s->opts.r |= 1;
        av_log(ctx, AV_LOG_WARNING, "Research size should be odd, setting to %i\n",
               s->opts.r);
    }

    if (!(s->opts.p & 1)) {
        s->opts.p |= 1;
        av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to %i\n",
               s->opts.p);
    }

    for (int i = 0; i < 4; i++) {
        double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
        int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);

        /* Precompute the (negative) scale applied to the summed squared patch
         * difference inside exp(): strength = -255^2 / (10*s)^2. */
        str  = 10.0f*str;
        str *= -str;
        str  = 255.0*255.0 / str;
        s->strength[i] = str;

        if (!(ps & 1)) {
            ps |= 1;
            av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to %i\n",
                   ps);
        }
        s->patch[i] = ps / 2;
    }

    rad = s->opts.r/2;
    s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
    s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
    s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));
    if (!s->xoffsets || !s->yoffsets) {
        err = AVERROR(ENOMEM);
        goto fail;
    }
    s->nb_offsets = 0;

    for (int x = -rad; x <= rad; x++) {
        for (int y = -rad; y <= rad; y++) {
            if (!x && !y)
                continue;

            s->xoffsets[xcnt++] = x;
            s->yoffsets[ycnt++] = y;
            s->nb_offsets++;
        }
    }

    RET(ff_vk_create_buf(&s->vkctx, &s->xyoffsets_buf, 2*s->nb_offsets*sizeof(int32_t), NULL, NULL,
                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
                         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
    RET(ff_vk_map_buffer(&s->vkctx, &s->xyoffsets_buf, (uint8_t **)&offsets_buf, 0));

    for (int i = 0; i < 2*s->nb_offsets; i += 2) {
        offsets_buf[i + 0] = s->xoffsets[i >> 1];
        offsets_buf[i + 1] = s->yoffsets[i >> 1];
    }

    RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1));

    s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
    if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
        av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, "
               "disabling dispatch parallelism\n");
        s->opts.t = 1;
    }

    spv = ff_vk_spirv_init();
    if (!spv) {
        av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
        return AVERROR_EXTERNAL;
    }

    ff_vk_qf_init(vkctx, &s->qf, VK_QUEUE_COMPUTE_BIT);
    RET(ff_vk_exec_pool_init(vkctx, &s->qf, &s->e, 1, 0, 0, 0, NULL));
    RET(ff_vk_init_sampler(vkctx, &s->sampler, 1, VK_FILTER_NEAREST));

    RET(init_weights_pipeline(vkctx, &s->e, &s->pl_weights, &s->shd_weights, s->sampler,
                              spv, s->vkctx.output_width, s->vkctx.output_height,
                              s->opts.t, desc, planes, &s->pl_weights_rows));
    RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,
                              spv, desc, planes));

    RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, NULL, 1, 0, 0,
                                    s->xyoffsets_buf.address, s->xyoffsets_buf.size,
                                    VK_FORMAT_UNDEFINED));

    /* Count how many dispatches are needed to cover all offsets (for logging). */
    do {
        int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
        offsets_dispatched += wg_invoc * TYPE_ELEMS;
        nb_dispatches++;
    } while (offsets_dispatched < s->nb_offsets);

    av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches\n",
           s->nb_offsets, nb_dispatches);

    s->initialized = 1;

fail:
    if (spv)
        spv->uninit(&spv);

    return err;
}
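
/* Records the final denoise dispatch: waits on the weights/sums writes, then
 * normalizes and stores the result, one workgroup grid per plane. */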
static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec,
                        FFVkBuffer *ws_vk, uint32_t ws_stride[4])
{
    FFVulkanContext *vkctx = &s->vkctx;
    FFVulkanFunctions *vk = &vkctx->vkfn;
    VkBufferMemoryBarrier2 buf_bar[8];
    int nb_buf_bar = 0;

    /* Denoise pass pipeline */
    ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_denoise);

    /* Push data */
    ff_vk_update_push_exec(vkctx, exec, &s->pl_denoise, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(DenoisePushData), &(DenoisePushData) {
                               { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
                           });

    /* Make the weights pass's writes visible to the denoise shader */
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
        .size = ws_vk->size,
        .offset = 0,
    };

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    });
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;

    /* Dispatch the denoise pass, one Z layer per plane */
    vk->CmdDispatch(exec->buf,
                    FFALIGN(vkctx->output_width, s->pl_denoise.wg_size[0])/s->pl_denoise.wg_size[0],
                    FFALIGN(vkctx->output_height, s->pl_denoise.wg_size[1])/s->pl_denoise.wg_size[1],
                    av_pix_fmt_count_planes(s->vkctx.output_format));

    return 0;
}
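
/* Per-frame entry point: allocates the integral and weights/sums buffers,
 * records the batched weights dispatches followed by the denoise pass, and
 * submits the command buffer. */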
static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
{
    int err;
    AVFrame *out = NULL;
    AVFilterContext *ctx = link->dst;
    NLMeansVulkanContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    FFVulkanContext *vkctx = &s->vkctx;
    FFVulkanFunctions *vk = &vkctx->vkfn;
    const AVPixFmtDescriptor *desc;
    int plane_widths[4];
    int plane_heights[4];

    int offsets_dispatched = 0;

    /* Integral */
    AVBufferRef *integral_buf = NULL;
    FFVkBuffer *integral_vk;
    size_t int_stride;
    size_t int_size;

    /* Weights/sums */
    AVBufferRef *ws_buf = NULL;
    FFVkBuffer *ws_vk;
    VkDeviceAddress weights_addr[4];
    VkDeviceAddress sums_addr[4];
    uint32_t ws_stride[4];
    size_t ws_size[4];
    size_t ws_total_size = 0;

    FFVkExecContext *exec;
    VkImageView in_views[AV_NUM_DATA_POINTERS];
    VkImageView out_views[AV_NUM_DATA_POINTERS];
    VkImageMemoryBarrier2 img_bar[8];
    int nb_img_bar = 0;
    VkBufferMemoryBarrier2 buf_bar[8];
    int nb_buf_bar = 0;

    if (!s->initialized)
        RET(init_filter(ctx));

    desc = av_pix_fmt_desc_get(vkctx->output_format);
    if (!desc)
        return AVERROR(EINVAL);

    /* Integral image */
    int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows*TYPE_SIZE;
    int_size = s->pl_weights.wg_size[0]*s->pl_weights_rows*int_stride;

    /* Plane dimensions */
    for (int i = 0; i < desc->nb_components; i++) {
        plane_widths[i] = !i || (i == 3) ? vkctx->output_width : AV_CEIL_RSHIFT(vkctx->output_width, desc->log2_chroma_w);
        plane_heights[i] = !i || (i == 3) ? vkctx->output_height : AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_h);
        plane_widths[i] = FFALIGN(plane_widths[i], s->pl_denoise.wg_size[0]);
        plane_heights[i] = FFALIGN(plane_heights[i], s->pl_denoise.wg_size[1]);

        ws_stride[i] = plane_widths[i];
        ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
        ws_total_size += ws_size[i];
    }

    /* Buffers */
    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->integral_buf_pool, &integral_buf,
                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                  NULL,
                                  s->opts.t * int_size,
                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    if (err < 0)
        return err;
    integral_vk = (FFVkBuffer *)integral_buf->data;

    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf,
                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                  NULL,
                                  ws_total_size * 2,
                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    if (err < 0)
        return err;
    ws_vk = (FFVkBuffer *)ws_buf->data;

    /* Weights occupy the first half of the buffer, sums the second half */
    weights_addr[0] = ws_vk->address;
    sums_addr[0] = ws_vk->address + ws_total_size;
    for (int i = 1; i < desc->nb_components; i++) {
        weights_addr[i] = weights_addr[i - 1] + ws_size[i - 1];
        sums_addr[i] = sums_addr[i - 1] + ws_size[i - 1];
    }

    /* Output frame */
    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!out) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    /* Execution context */
    exec = ff_vk_exec_get(&s->e);
    ff_vk_exec_start(vkctx, exec);

    /* Dependencies */
    RET(ff_vk_exec_add_dep_frame(vkctx, exec, in,
                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
    RET(ff_vk_exec_add_dep_frame(vkctx, exec, out,
                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0));
    integral_buf = NULL;

    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf, 1, 0));
    ws_buf = NULL;

    /* Input frame prep */
    RET(ff_vk_create_imageviews(vkctx, exec, in_views, in));
    ff_vk_update_descriptor_img_array(vkctx, &s->pl_weights, exec, in, in_views, 0, 0,
                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                      s->sampler);
    ff_vk_frame_barrier(vkctx, exec, in, img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_READ_BIT,
                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        VK_QUEUE_FAMILY_IGNORED);

    /* Output frame prep */
    RET(ff_vk_create_imageviews(vkctx, exec, out_views, out));
    ff_vk_frame_barrier(vkctx, exec, out, img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);

    nb_buf_bar = 0;
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
        .size = ws_vk->size,
        .offset = 0,
    };
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = integral_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = integral_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = integral_vk->buf,
        .size = integral_vk->size,
        .offset = 0,
    };

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pImageMemoryBarriers = img_bar,
        .imageMemoryBarrierCount = nb_img_bar,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    });
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
    integral_vk->stage = buf_bar[1].dstStageMask;
    integral_vk->access = buf_bar[1].dstAccessMask;

    /* Buffer zeroing */
    vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);

    nb_buf_bar = 0;
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
        .size = ws_vk->size,
        .offset = 0,
    };
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    });
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;

    /* Update weights descriptors */
    ff_vk_update_descriptor_img_array(vkctx, &s->pl_weights, exec, in, in_views, 0, 0,
                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                      s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, exec, 0, 1 + i*2 + 0, 0,
                                        weights_addr[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, exec, 0, 1 + i*2 + 1, 0,
                                        sums_addr[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
    }

    /* Update denoise descriptors */
    ff_vk_update_descriptor_img_array(vkctx, &s->pl_denoise, exec, in, in_views, 0, 0,
                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                      s->sampler);
    ff_vk_update_descriptor_img_array(vkctx, &s->pl_denoise, exec, out, out_views, 0, 1,
                                      VK_IMAGE_LAYOUT_GENERAL, s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_denoise, exec, 0, 2 + i*2 + 0, 0,
                                        weights_addr[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_denoise, exec, 0, 2 + i*2 + 1, 0,
                                        sums_addr[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
    }

    /* Weights pipeline */
    ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_weights);

    do {
        int wg_invoc;
        HorizontalPushData pd = {
            { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
            { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
            { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
            { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
            { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
            integral_vk->address,
            (uint64_t)int_size,
            (uint64_t)int_stride,
            offsets_dispatched,
        };

        /* The integral buffer is reused between batches, so every dispatch
         * after the first must wait for the previous one to finish. */
        if (offsets_dispatched) {
            nb_buf_bar = 0;
            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                .srcStageMask = integral_vk->stage,
                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                .srcAccessMask = integral_vk->access,
                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .buffer = integral_vk->buf,
                .size = integral_vk->size,
                .offset = 0,
            };

            vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
                .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
                .pBufferMemoryBarriers = buf_bar,
                .bufferMemoryBarrierCount = nb_buf_bar,
            });
            integral_vk->stage = buf_bar[0].dstStageMask;
            integral_vk->access = buf_bar[0].dstAccessMask;
        }

        /* Push data */
        ff_vk_update_push_exec(vkctx, exec, &s->pl_weights, VK_SHADER_STAGE_COMPUTE_BIT,
                               0, sizeof(pd), &pd);

        wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);

        /* Dispatch the weights pass for this batch of offsets */
        vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);

        offsets_dispatched += wg_invoc * TYPE_ELEMS;
    } while (offsets_dispatched < s->nb_offsets);

    RET(denoise_pass(s, exec, ws_vk, ws_stride));

    err = ff_vk_exec_submit(vkctx, exec);
    if (err < 0)
        goto fail;

    err = av_frame_copy_props(out, in);
    if (err < 0)
        goto fail;

    av_frame_free(&in);

    return ff_filter_frame(outlink, out);

fail:
    av_buffer_unref(&integral_buf);
    av_buffer_unref(&ws_buf);
    av_frame_free(&in);
    av_frame_free(&out);
    return err;
}

static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
{
    NLMeansVulkanContext *s = avctx->priv;
    FFVulkanContext *vkctx = &s->vkctx;
    FFVulkanFunctions *vk = &vkctx->vkfn;

    ff_vk_exec_pool_free(vkctx, &s->e);
    ff_vk_pipeline_free(vkctx, &s->pl_weights);
    ff_vk_shader_free(vkctx, &s->shd_weights);
    ff_vk_pipeline_free(vkctx, &s->pl_denoise);
    ff_vk_shader_free(vkctx, &s->shd_denoise);

    av_buffer_pool_uninit(&s->integral_buf_pool);
    av_buffer_pool_uninit(&s->ws_buf_pool);

    if (s->sampler)
        vk->DestroySampler(vkctx->hwctx->act_dev, s->sampler,
                           vkctx->hwctx->alloc);

    ff_vk_uninit(&s->vkctx);

    av_freep(&s->xoffsets);
    av_freep(&s->yoffsets);

    s->initialized = 0;
}

#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)

static const AVOption nlmeans_vulkan_options[] = {
    { "s",  "denoising strength for all components", OFFSET(opts.s), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
    { "p",  "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS },
{ "r", "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
{ "t", "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 36 }, 1, 168, FLAGS },
{ "s1", "denoising strength for component 1", OFFSET(opts.sc[0]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
{ "s2", "denoising strength for component 2", OFFSET(opts.sc[1]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
{ "s3", "denoising strength for component 3", OFFSET(opts.sc[2]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
{ "s4", "denoising strength for component 4", OFFSET(opts.sc[3]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
{ "p1", "patch size for component 1", OFFSET(opts.pc[0]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
{ "p2", "patch size for component 2", OFFSET(opts.pc[1]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
{ "p3", "patch size for component 3", OFFSET(opts.pc[2]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
{ "p4", "patch size for component 4", OFFSET(opts.pc[3]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
{ NULL }
};
AVFILTER_DEFINE_CLASS(nlmeans_vulkan);

static const AVFilterPad nlmeans_vulkan_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .filter_frame = &nlmeans_vulkan_filter_frame,
        .config_props = &ff_vk_filter_config_input,
    },
};

static const AVFilterPad nlmeans_vulkan_outputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = &ff_vk_filter_config_output,
    },
};

const AVFilter ff_vf_nlmeans_vulkan = {
    .name           = "nlmeans_vulkan",
    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser (Vulkan)"),
    .priv_size      = sizeof(NLMeansVulkanContext),
    .init           = &ff_vk_filter_init,
    .uninit         = &nlmeans_vulkan_uninit,
    FILTER_INPUTS(nlmeans_vulkan_inputs),
    FILTER_OUTPUTS(nlmeans_vulkan_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_VULKAN),
    .priv_class     = &nlmeans_vulkan_class,
    .flags          = AVFILTER_FLAG_HWDEVICE,
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};