libavfilter: add vf_colorrange_cuda, CUDA-accelerated color conversion filter

Signed-off-by: Timo Rothenpieler <timo@rothenpieler.org>
This commit is contained in:
Roman Arzumanyan 2022-09-10 11:05:56 +03:00 committed by Timo Rothenpieler
parent c177108ae1
commit cc81ab283c
7 changed files with 570 additions and 3 deletions

6
configure vendored
View File

@ -3149,10 +3149,12 @@ qsvvpp_select="qsv"
vaapi_encode_deps="vaapi"
v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
chromakey_cuda_filter_deps="ffnvcodec"
chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
bilateral_cuda_filter_deps="ffnvcodec"
bilateral_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
chromakey_cuda_filter_deps="ffnvcodec"
chromakey_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
colorspace_cuda_filter_deps="ffnvcodec"
colorspace_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
hwupload_cuda_filter_deps="ffnvcodec"
scale_npp_filter_deps="ffnvcodec libnpp"
scale2ref_npp_filter_deps="ffnvcodec libnpp"

View File

@ -9725,6 +9725,38 @@ For example to convert the input to SMPTE-240M, use the command:
colorspace=smpte240m
@end example
@section colorspace_cuda
CUDA accelerated implementation of the colorspace filter.
It is by no means feature complete compared to the software colorspace filter,
and at the current time only supports color range conversion between jpeg/full
and mpeg/limited range.
The filter accepts the following options:
@table @option
@item range
Specify output color range.
The accepted values are:
@table @samp
@item tv
TV (restricted) range
@item mpeg
MPEG (restricted) range
@item pc
PC (full) range
@item jpeg
JPEG (full) range
@end table
@end table
@section colortemperature
Adjust color temperature in video to simulate variations in ambient color temperature.

View File

@ -230,6 +230,9 @@ OBJS-$(CONFIG_COLORLEVELS_FILTER) += vf_colorlevels.o
OBJS-$(CONFIG_COLORMAP_FILTER) += vf_colormap.o
OBJS-$(CONFIG_COLORMATRIX_FILTER) += vf_colormatrix.o
OBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o colorspacedsp.o
OBJS-$(CONFIG_COLORSPACE_CUDA_FILTER) += vf_colorspace_cuda.o \
vf_colorspace_cuda.ptx.o \
cuda/load_helper.o
OBJS-$(CONFIG_COLORTEMPERATURE_FILTER) += vf_colortemperature.o
OBJS-$(CONFIG_CONVOLUTION_FILTER) += vf_convolution.o
OBJS-$(CONFIG_CONVOLUTION_OPENCL_FILTER) += vf_convolution_opencl.o opencl.o \

View File

@ -213,6 +213,7 @@ extern const AVFilter ff_vf_colorlevels;
extern const AVFilter ff_vf_colormap;
extern const AVFilter ff_vf_colormatrix;
extern const AVFilter ff_vf_colorspace;
extern const AVFilter ff_vf_colorspace_cuda;
extern const AVFilter ff_vf_colortemperature;
extern const AVFilter ff_vf_convolution;
extern const AVFilter ff_vf_convolution_opencl;

View File

@ -31,7 +31,7 @@
#include "version_major.h"
#define LIBAVFILTER_VERSION_MINOR 48
#define LIBAVFILTER_VERSION_MINOR 49
#define LIBAVFILTER_VERSION_MICRO 100

View File

@ -0,0 +1,435 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <string.h>
#include "libavutil/avstring.h"
#include "libavutil/common.h"
#include "libavutil/cuda_check.h"
#include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_cuda_internal.h"
#include "libavutil/internal.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "avfilter.h"
#include "formats.h"
#include "internal.h"
#include "scale_eval.h"
#include "video.h"
#include "cuda/load_helper.h"
static const enum AVPixelFormat supported_formats[] = {
AV_PIX_FMT_NV12,
AV_PIX_FMT_YUV420P,
AV_PIX_FMT_YUV444P,
};
#define DIV_UP(a, b) (((a) + (b)-1) / (b))
#define BLOCKX 32
#define BLOCKY 16
#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
typedef struct CUDAColorspaceContext {
const AVClass* class;
AVCUDADeviceContext* hwctx;
AVBufferRef* frames_ctx;
AVFrame* own_frame;
AVFrame* tmp_frame;
CUcontext cu_ctx;
CUstream cu_stream;
CUmodule cu_module;
CUfunction cu_convert[AVCOL_RANGE_NB];
enum AVPixelFormat pix_fmt;
enum AVColorRange range;
int num_planes;
} CUDAColorspaceContext;
static av_cold int cudacolorspace_init(AVFilterContext* ctx)
{
CUDAColorspaceContext* s = ctx->priv;
s->own_frame = av_frame_alloc();
if (!s->own_frame)
return AVERROR(ENOMEM);
s->tmp_frame = av_frame_alloc();
if (!s->tmp_frame)
return AVERROR(ENOMEM);
return 0;
}
static av_cold void cudacolorspace_uninit(AVFilterContext* ctx)
{
CUDAColorspaceContext* s = ctx->priv;
if (s->hwctx && s->cu_module) {
CudaFunctions* cu = s->hwctx->internal->cuda_dl;
CUcontext dummy;
CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
CHECK_CU(cu->cuModuleUnload(s->cu_module));
s->cu_module = NULL;
CHECK_CU(cu->cuCtxPopCurrent(&dummy));
}
av_frame_free(&s->own_frame);
av_buffer_unref(&s->frames_ctx);
av_frame_free(&s->tmp_frame);
}
static av_cold int init_hwframe_ctx(CUDAColorspaceContext* s, AVBufferRef* device_ctx,
int width, int height)
{
AVBufferRef* out_ref = NULL;
AVHWFramesContext* out_ctx;
int ret;
out_ref = av_hwframe_ctx_alloc(device_ctx);
if (!out_ref)
return AVERROR(ENOMEM);
out_ctx = (AVHWFramesContext*)out_ref->data;
out_ctx->format = AV_PIX_FMT_CUDA;
out_ctx->sw_format = s->pix_fmt;
out_ctx->width = FFALIGN(width, 32);
out_ctx->height = FFALIGN(height, 32);
ret = av_hwframe_ctx_init(out_ref);
if (ret < 0)
goto fail;
av_frame_unref(s->own_frame);
ret = av_hwframe_get_buffer(out_ref, s->own_frame, 0);
if (ret < 0)
goto fail;
s->own_frame->width = width;
s->own_frame->height = height;
av_buffer_unref(&s->frames_ctx);
s->frames_ctx = out_ref;
return 0;
fail:
av_buffer_unref(&out_ref);
return ret;
}
static int format_is_supported(enum AVPixelFormat fmt)
{
for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
if (fmt == supported_formats[i])
return 1;
return 0;
}
static av_cold int init_processing_chain(AVFilterContext* ctx, int width,
int height)
{
CUDAColorspaceContext* s = ctx->priv;
AVHWFramesContext* in_frames_ctx;
int ret;
if (!ctx->inputs[0]->hw_frames_ctx) {
av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
return AVERROR(EINVAL);
}
in_frames_ctx = (AVHWFramesContext*)ctx->inputs[0]->hw_frames_ctx->data;
s->pix_fmt = in_frames_ctx->sw_format;
if (!format_is_supported(s->pix_fmt)) {
av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n",
av_get_pix_fmt_name(s->pix_fmt));
return AVERROR(EINVAL);
}
if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range)) {
av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n");
return AVERROR(EINVAL);
}
s->num_planes = av_pix_fmt_count_planes(s->pix_fmt);
ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height);
if (ret < 0)
return ret;
ctx->outputs[0]->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
if (!ctx->outputs[0]->hw_frames_ctx)
return AVERROR(ENOMEM);
return 0;
}
static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx)
{
CUDAColorspaceContext* s = ctx->priv;
CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
CudaFunctions* cu = s->hwctx->internal->cuda_dl;
int ret;
extern const unsigned char ff_vf_colorspace_cuda_ptx_data[];
extern const unsigned int ff_vf_colorspace_cuda_ptx_len;
ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
if (ret < 0)
return ret;
ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
ff_vf_colorspace_cuda_ptx_data,
ff_vf_colorspace_cuda_ptx_len);
if (ret < 0)
goto fail;
ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_MPEG], s->cu_module, "to_mpeg_cuda"));
if (ret < 0)
goto fail;
ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_JPEG], s->cu_module, "to_jpeg_cuda"));
if (ret < 0)
goto fail;
fail:
CHECK_CU(cu->cuCtxPopCurrent(&dummy));
return ret;
}
static av_cold int cudacolorspace_config_props(AVFilterLink* outlink)
{
AVFilterContext* ctx = outlink->src;
AVFilterLink* inlink = outlink->src->inputs[0];
CUDAColorspaceContext* s = ctx->priv;
AVHWFramesContext* frames_ctx =
(AVHWFramesContext*)inlink->hw_frames_ctx->data;
AVCUDADeviceContext* device_hwctx = frames_ctx->device_ctx->hwctx;
int ret;
s->hwctx = device_hwctx;
s->cu_stream = s->hwctx->stream;
outlink->w = inlink->w;
outlink->h = inlink->h;
ret = init_processing_chain(ctx, inlink->w, inlink->h);
if (ret < 0)
return ret;
if (inlink->sample_aspect_ratio.num) {
outlink->sample_aspect_ratio = av_mul_q(
(AVRational){outlink->h * inlink->w, outlink->w * inlink->h},
inlink->sample_aspect_ratio);
} else {
outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
}
ret = cudacolorspace_load_functions(ctx);
if (ret < 0)
return ret;
return ret;
}
static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
{
CUDAColorspaceContext* s = ctx->priv;
CudaFunctions* cu = s->hwctx->internal->cuda_dl;
CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
int ret;
ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
if (ret < 0)
return ret;
out->color_range = s->range;
for (int i = 0; i < s->num_planes; i++) {
int width = in->width, height = in->height, comp_id = (i > 0);
switch (s->pix_fmt) {
case AV_PIX_FMT_YUV444P:
break;
case AV_PIX_FMT_YUV420P:
width = comp_id ? in->width / 2 : in->width;
case AV_PIX_FMT_NV12:
height = comp_id ? in->height / 2 : in->height;
break;
default:
av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n",
av_get_pix_fmt_name(s->pix_fmt));
return AVERROR(EINVAL);
}
if (!s->cu_convert[out->color_range]) {
av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n");
return AVERROR(EINVAL);
}
if (in->color_range != out->color_range) {
void* args[] = {&in->data[i], &out->data[i], &in->linesize[i],
&comp_id};
ret = CHECK_CU(cu->cuLaunchKernel(
s->cu_convert[out->color_range], DIV_UP(width, BLOCKX),
DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream,
args, NULL));
} else {
ret = av_hwframe_transfer_data(out, in, 0);
if (ret < 0)
return ret;
}
}
CHECK_CU(cu->cuCtxPopCurrent(&dummy));
return ret;
}
static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
{
CUDAColorspaceContext* s = ctx->priv;
AVFilterLink* outlink = ctx->outputs[0];
AVFrame* src = in;
int ret;
ret = conv_cuda_convert(ctx, s->own_frame, src);
if (ret < 0)
return ret;
src = s->own_frame;
ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
if (ret < 0)
return ret;
av_frame_move_ref(out, s->own_frame);
av_frame_move_ref(s->own_frame, s->tmp_frame);
s->own_frame->width = outlink->w;
s->own_frame->height = outlink->h;
ret = av_frame_copy_props(out, in);
if (ret < 0)
return ret;
return 0;
}
static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in)
{
AVFilterContext* ctx = link->dst;
CUDAColorspaceContext* s = ctx->priv;
AVFilterLink* outlink = ctx->outputs[0];
CudaFunctions* cu = s->hwctx->internal->cuda_dl;
AVFrame* out = NULL;
CUcontext dummy;
int ret = 0;
out = av_frame_alloc();
if (!out) {
ret = AVERROR(ENOMEM);
goto fail;
}
ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
if (ret < 0)
goto fail;
ret = cudacolorspace_conv(ctx, out, in);
CHECK_CU(cu->cuCtxPopCurrent(&dummy));
if (ret < 0)
goto fail;
av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
(int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
(int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
INT_MAX);
av_frame_free(&in);
return ff_filter_frame(outlink, out);
fail:
av_frame_free(&in);
av_frame_free(&out);
return ret;
}
#define OFFSET(x) offsetof(CUDAColorspaceContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption options[] = {
{"range", "Output video range", OFFSET(range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_NB - 1, FLAGS, "range"},
{"tv", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range"},
{"mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range"},
{"pc", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range"},
{"jpeg", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range"},
{NULL},
};
static const AVClass cudacolorspace_class = {
.class_name = "colorspace_cuda",
.item_name = av_default_item_name,
.option = options,
.version = LIBAVUTIL_VERSION_INT,
};
static const AVFilterPad cudacolorspace_inputs[] = {
{
.name = "default",
.type = AVMEDIA_TYPE_VIDEO,
.filter_frame = cudacolorspace_filter_frame,
},
};
static const AVFilterPad cudacolorspace_outputs[] = {
{
.name = "default",
.type = AVMEDIA_TYPE_VIDEO,
.config_props = cudacolorspace_config_props,
},
};
const AVFilter ff_vf_colorspace_cuda = {
.name = "colorspace_cuda",
.description = NULL_IF_CONFIG_SMALL("CUDA accelerated video color converter"),
.init = cudacolorspace_init,
.uninit = cudacolorspace_uninit,
.priv_size = sizeof(CUDAColorspaceContext),
.priv_class = &cudacolorspace_class,
FILTER_INPUTS(cudacolorspace_inputs),
FILTER_OUTPUTS(cudacolorspace_outputs),
FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
.flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
extern "C" {
#define MPEG_LUMA_MIN (16)
#define MPEG_CHROMA_MIN (16)
#define MPEG_LUMA_MAX (235)
#define MPEG_CHROMA_MAX (240)
#define JPEG_LUMA_MIN (0)
#define JPEG_CHROMA_MIN (1)
#define JPEG_LUMA_MAX (255)
#define JPEG_CHROMA_MAX (255)
__device__ int mpeg_min[] = {MPEG_LUMA_MIN, MPEG_CHROMA_MIN};
__device__ int mpeg_max[] = {MPEG_LUMA_MAX, MPEG_CHROMA_MAX};
__device__ int jpeg_min[] = {JPEG_LUMA_MIN, JPEG_CHROMA_MIN};
__device__ int jpeg_max[] = {JPEG_LUMA_MAX, JPEG_CHROMA_MAX};
__device__ int clamp(int val, int min, int max)
{
if (val < min)
return min;
else if (val > max)
return max;
else
return val;
}
__global__ void to_jpeg_cuda(const unsigned char* src, unsigned char* dst,
int pitch, int comp_id)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int src_, dst_;
// 8 bit -> 15 bit for better precision
src_ = static_cast<int>(src[x + y * pitch]) << 7;
// Conversion
dst_ = comp_id ? (min(src_, 30775) * 4663 - 9289992) >> 12 // chroma
: (min(src_, 30189) * 19077 - 39057361) >> 14; // luma
// Dither replacement
dst_ = dst_ + 64;
// Back to 8 bit
dst_ = clamp(dst_ >> 7, jpeg_min[comp_id], jpeg_max[comp_id]);
dst[x + y * pitch] = static_cast<unsigned char>(dst_);
}
__global__ void to_mpeg_cuda(const unsigned char* src, unsigned char* dst,
int pitch, int comp_id)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int src_, dst_;
// 8 bit -> 15 bit for better precision
src_ = static_cast<int>(src[x + y * pitch]) << 7;
// Conversion
dst_ = comp_id ? (src_ * 1799 + 4081085) >> 11 // chroma
: (src_ * 14071 + 33561947) >> 14; // luma
// Dither replacement
dst_ = dst_ + 64;
// Back to 8 bit
dst_ = clamp(dst_ >> 7, mpeg_min[comp_id], mpeg_max[comp_id]);
dst[x + y * pitch] = static_cast<unsigned char>(dst_);
}
}