// rife-ncnn-vulkan/src/rife.cpp
// rife implemented with ncnn library
#include "rife.h"
#include <algorithm>
#include <vector>
#include "benchmark.h"
#include "rife_preproc.comp.hex.h"
#include "rife_postproc.comp.hex.h"
#include "rife_preproc_tta.comp.hex.h"
#include "rife_postproc_tta.comp.hex.h"
#include "rife_flow_tta_avg.comp.hex.h"
#include "rife_v2_flow_tta_avg.comp.hex.h"
#include "rife_v4_flow_tta_avg.comp.hex.h"
#include "rife_flow_tta_temporal_avg.comp.hex.h"
#include "rife_v2_flow_tta_temporal_avg.comp.hex.h"
#include "rife_v4_flow_tta_temporal_avg.comp.hex.h"
#include "rife_out_tta_temporal_avg.comp.hex.h"
#include "rife_v4_timestep.comp.hex.h"
#include "rife_v4_timestep_tta.comp.hex.h"
#include "rife_ops.h"
DEFINE_LAYER_CREATOR(Warp)
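// Mode flags (behavior as implemented below):
//   tta_mode          - spatial test-time augmentation: run all 8 flip/rotation
//                       variants of the input pair and average the results
//   tta_temporal_mode - additionally run with the frame order reversed and average
//   uhd_mode          - estimate flow at half resolution, then upscale it (for 4K input)
//   rife_v2 / rife_v4 - select the model-generation specific graph layout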
RIFE::RIFE(int gpuid, bool _tta_mode, bool _tta_temporal_mode, bool _uhd_mode, int _num_threads, bool _rife_v2, bool _rife_v4)
{
vkdev = gpuid == -1 ? 0 : ncnn::get_gpu_device(gpuid);
rife_preproc = 0;
rife_postproc = 0;
rife_flow_tta_avg = 0;
rife_flow_tta_temporal_avg = 0;
rife_out_tta_temporal_avg = 0;
rife_v4_timestep = 0;
rife_uhd_downscale_image = 0;
rife_uhd_upscale_flow = 0;
rife_uhd_double_flow = 0;
rife_v2_slice_flow = 0;
tta_mode = _tta_mode;
tta_temporal_mode = _tta_temporal_mode;
uhd_mode = _uhd_mode;
num_threads = _num_threads;
rife_v2 = _rife_v2;
rife_v4 = _rife_v4;
}
RIFE::~RIFE()
{
// cleanup preprocess/postprocess and tta pipelines
{
delete rife_preproc;
delete rife_postproc;
delete rife_flow_tta_avg;
delete rife_flow_tta_temporal_avg;
delete rife_out_tta_temporal_avg;
delete rife_v4_timestep;
}
if (uhd_mode)
{
rife_uhd_downscale_image->destroy_pipeline(flownet.opt);
delete rife_uhd_downscale_image;
rife_uhd_upscale_flow->destroy_pipeline(flownet.opt);
delete rife_uhd_upscale_flow;
rife_uhd_double_flow->destroy_pipeline(flownet.opt);
delete rife_uhd_double_flow;
}
if (rife_v2)
{
rife_v2_slice_flow->destroy_pipeline(flownet.opt);
delete rife_v2_slice_flow;
}
}
#if _WIN32
static void load_param_model(ncnn::Net& net, const std::wstring& modeldir, const wchar_t* name)
{
wchar_t parampath[256];
wchar_t modelpath[256];
swprintf(parampath, 256, L"%s/%s.param", modeldir.c_str(), name);
swprintf(modelpath, 256, L"%s/%s.bin", modeldir.c_str(), name);
{
FILE* fp = _wfopen(parampath, L"rb");
if (!fp)
{
fwprintf(stderr, L"_wfopen %ls failed\n", parampath);
return; // avoid passing a null FILE* to load_param/fclose
}
net.load_param(fp);
fclose(fp);
}
{
FILE* fp = _wfopen(modelpath, L"rb");
if (!fp)
{
fwprintf(stderr, L"_wfopen %ls failed\n", modelpath);
return; // avoid passing a null FILE* to load_model/fclose
}
net.load_model(fp);
fclose(fp);
}
}
#else
static void load_param_model(ncnn::Net& net, const std::string& modeldir, const char* name)
{
char parampath[256];
char modelpath[256];
snprintf(parampath, sizeof(parampath), "%s/%s.param", modeldir.c_str(), name);
snprintf(modelpath, sizeof(modelpath), "%s/%s.bin", modeldir.c_str(), name);
net.load_param(parampath);
net.load_model(modelpath);
}
#endif
#if _WIN32
int RIFE::load(const std::wstring& modeldir)
#else
int RIFE::load(const std::string& modeldir)
#endif
{
ncnn::Option opt;
opt.num_threads = num_threads;
opt.use_vulkan_compute = vkdev ? true : false;
opt.use_fp16_packed = vkdev ? true : false;
opt.use_fp16_storage = vkdev ? true : false;
opt.use_fp16_arithmetic = false;
opt.use_int8_storage = true;
flownet.opt = opt;
contextnet.opt = opt;
fusionnet.opt = opt;
flownet.set_vulkan_device(vkdev);
contextnet.set_vulkan_device(vkdev);
fusionnet.set_vulkan_device(vkdev);
flownet.register_custom_layer("rife.Warp", Warp_layer_creator);
contextnet.register_custom_layer("rife.Warp", Warp_layer_creator);
fusionnet.register_custom_layer("rife.Warp", Warp_layer_creator);
#if _WIN32
load_param_model(flownet, modeldir, L"flownet");
if (!rife_v4)
{
load_param_model(contextnet, modeldir, L"contextnet");
load_param_model(fusionnet, modeldir, L"fusionnet");
}
#else
load_param_model(flownet, modeldir, "flownet");
if (!rife_v4)
{
load_param_model(contextnet, modeldir, "contextnet");
load_param_model(fusionnet, modeldir, "fusionnet");
}
#endif
// initialize preprocess and postprocess pipeline
if (vkdev)
{
std::vector<ncnn::vk_specialization_type> specializations(1);
#if _WIN32
specializations[0].i = 1;
#else
specializations[0].i = 0;
#endif
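// the specialization constant selects BGR (Windows) vs RGB (elsewhere) channel
// order inside the pre/post shaders, matching the PIXEL_BGR2RGB / PIXEL_RGB
// conversions used on the non-zero-copy path below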
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
if (tta_mode)
compile_spirv_module(rife_preproc_tta_comp_data, sizeof(rife_preproc_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
}
}
rife_preproc = new ncnn::Pipeline(vkdev);
rife_preproc->set_optimal_local_size_xyz(8, 8, 3);
rife_preproc->create(spirv.data(), spirv.size() * 4, specializations);
}
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
if (tta_mode)
compile_spirv_module(rife_postproc_tta_comp_data, sizeof(rife_postproc_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
}
}
rife_postproc = new ncnn::Pipeline(vkdev);
rife_postproc->set_optimal_local_size_xyz(8, 8, 3);
rife_postproc->create(spirv.data(), spirv.size() * 4, specializations);
}
}
if (vkdev && tta_mode)
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
if (rife_v4)
{
compile_spirv_module(rife_v4_flow_tta_avg_comp_data, sizeof(rife_v4_flow_tta_avg_comp_data), opt, spirv);
}
else if (rife_v2)
{
compile_spirv_module(rife_v2_flow_tta_avg_comp_data, sizeof(rife_v2_flow_tta_avg_comp_data), opt, spirv);
}
else
{
compile_spirv_module(rife_flow_tta_avg_comp_data, sizeof(rife_flow_tta_avg_comp_data), opt, spirv);
}
}
}
std::vector<ncnn::vk_specialization_type> specializations(0);
rife_flow_tta_avg = new ncnn::Pipeline(vkdev);
rife_flow_tta_avg->set_optimal_local_size_xyz(8, 8, 1);
rife_flow_tta_avg->create(spirv.data(), spirv.size() * 4, specializations);
}
if (vkdev && tta_temporal_mode)
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
if (rife_v4)
{
compile_spirv_module(rife_v4_flow_tta_temporal_avg_comp_data, sizeof(rife_v4_flow_tta_temporal_avg_comp_data), opt, spirv);
}
else if (rife_v2)
{
compile_spirv_module(rife_v2_flow_tta_temporal_avg_comp_data, sizeof(rife_v2_flow_tta_temporal_avg_comp_data), opt, spirv);
}
else
{
compile_spirv_module(rife_flow_tta_temporal_avg_comp_data, sizeof(rife_flow_tta_temporal_avg_comp_data), opt, spirv);
}
}
}
std::vector<ncnn::vk_specialization_type> specializations(0);
rife_flow_tta_temporal_avg = new ncnn::Pipeline(vkdev);
rife_flow_tta_temporal_avg->set_optimal_local_size_xyz(8, 8, 1);
rife_flow_tta_temporal_avg->create(spirv.data(), spirv.size() * 4, specializations);
}
if (vkdev && tta_temporal_mode)
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_out_tta_temporal_avg_comp_data, sizeof(rife_out_tta_temporal_avg_comp_data), opt, spirv);
}
}
std::vector<ncnn::vk_specialization_type> specializations(0);
rife_out_tta_temporal_avg = new ncnn::Pipeline(vkdev);
rife_out_tta_temporal_avg->set_optimal_local_size_xyz(8, 8, 1);
rife_out_tta_temporal_avg->create(spirv.data(), spirv.size() * 4, specializations);
}
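// UHD mode: run flownet on a bilinear half-resolution pair, then bilinearly
// upscale the resulting flow 2x and double its magnitude so it applies to the
// full-resolution frames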
if (uhd_mode)
{
{
rife_uhd_downscale_image = ncnn::create_layer("Interp");
rife_uhd_downscale_image->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, 2);// bilinear
pd.set(1, 0.5f);
pd.set(2, 0.5f);
rife_uhd_downscale_image->load_param(pd);
rife_uhd_downscale_image->create_pipeline(opt);
}
{
rife_uhd_upscale_flow = ncnn::create_layer("Interp");
rife_uhd_upscale_flow->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, 2);// bilinear
pd.set(1, 2.f);
pd.set(2, 2.f);
rife_uhd_upscale_flow->load_param(pd);
rife_uhd_upscale_flow->create_pipeline(opt);
}
{
rife_uhd_double_flow = ncnn::create_layer("BinaryOp");
rife_uhd_double_flow->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, 2);// mul
pd.set(1, 1);// with_scalar
pd.set(2, 2.f);// b
rife_uhd_double_flow->load_param(pd);
rife_uhd_double_flow->create_pipeline(opt);
}
}
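// rife v2 emits one 4-channel flow blob (the two 2-channel flows stacked);
// slice it on the channel axis into flow0/flow1 for the two contextnet passes.
// slice points of -233 ask ncnn to split evenly.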
if (rife_v2)
{
{
rife_v2_slice_flow = ncnn::create_layer("Slice");
rife_v2_slice_flow->vkdev = vkdev;
ncnn::Mat slice_points(2);
slice_points.fill<int>(-233);
ncnn::ParamDict pd;
pd.set(0, slice_points);
pd.set(1, 0);// axis
rife_v2_slice_flow->load_param(pd);
rife_v2_slice_flow->create_pipeline(opt);
}
}
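// rife v4 takes time as an explicit network input; this pipeline fills a
// padded per-pixel map with the requested timestep value (the tta variant
// writes all 8 orientations)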
if (rife_v4)
{
if (vkdev)
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
if (tta_mode)
compile_spirv_module(rife_v4_timestep_tta_comp_data, sizeof(rife_v4_timestep_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_v4_timestep_comp_data, sizeof(rife_v4_timestep_comp_data), opt, spirv);
}
}
std::vector<ncnn::vk_specialization_type> specializations;
rife_v4_timestep = new ncnn::Pipeline(vkdev);
rife_v4_timestep->set_optimal_local_size_xyz(8, 8, 1);
rife_v4_timestep->create(spirv.data(), spirv.size() * 4, specializations);
}
}
return 0;
}
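// Typical usage (sketch; the flag values and model path are illustrative):
//   RIFE rife(/*gpuid*/ 0, /*tta*/ false, /*tta_temporal*/ false, /*uhd*/ false,
//             /*num_threads*/ 1, /*rife_v2*/ false, /*rife_v4*/ true);
//   rife.load("rife-v4"); // directory holding flownet.param/.bin (+ contextnet/fusionnet for v1/v2)
//   rife.process(in0, in1, 0.5f, out); // in0/in1/out wrap packed 8-bit RGB pixel buffers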
int RIFE::process(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float timestep, ncnn::Mat& outimage) const
{
if (!vkdev)
{
// cpu only
if (rife_v4)
return process_v4_cpu(in0image, in1image, timestep, outimage);
else
return process_cpu(in0image, in1image, timestep, outimage);
}
if (rife_v4)
return process_v4(in0image, in1image, timestep, outimage);
if (timestep == 0.f)
{
outimage = in0image;
return 0;
}
if (timestep == 1.f)
{
outimage = in1image;
return 0;
}
const unsigned char* pixel0data = (const unsigned char*)in0image.data;
const unsigned char* pixel1data = (const unsigned char*)in1image.data;
const int w = in0image.w;
const int h = in0image.h;
const int channels = 3;//in0image.elempack;
// fprintf(stderr, "%d x %d\n", w, h);
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
ncnn::Option opt = flownet.opt;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;
// pad to 32n (flownet downsamples repeatedly; every intermediate size must divide evenly)
int w_padded = (w + 31) / 32 * 32;
int h_padded = (h + 31) / 32 * 32;
const size_t in_out_tile_elemsize = opt.use_fp16_storage ? 2u : 4u;
ncnn::Mat in0;
ncnn::Mat in1;
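// fp16 + int8 storage: wrap the interleaved 8-bit pixel buffer directly (zero
// copy) and let the preproc shader unpack/normalize on the GPU; otherwise
// convert to planar float on the CPU first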
if (opt.use_fp16_storage && opt.use_int8_storage)
{
in0 = ncnn::Mat(w, h, (unsigned char*)pixel0data, (size_t)channels, 1);
in1 = ncnn::Mat(w, h, (unsigned char*)pixel1data, (size_t)channels, 1);
}
else
{
#if _WIN32
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
#else
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_RGB, w, h);
#endif
}
ncnn::VkCompute cmd(vkdev);
// upload
ncnn::VkMat in0_gpu;
ncnn::VkMat in1_gpu;
{
cmd.record_clone(in0, in0_gpu, opt);
cmd.record_clone(in1, in1_gpu, opt);
}
ncnn::VkMat out_gpu;
if (tta_mode)
{
// preproc
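// build the 8 dihedral variants of the padded input: [0..3] keep w x h
// (identity, horizontal mirror, 180-degree rotation, vertical flip),
// [4..7] are the transposed, w/h-swapped rotations/flips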
ncnn::VkMat in0_gpu_padded[8];
ncnn::VkMat in1_gpu_padded[8];
{
in0_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded[0];
bindings[2] = in0_gpu_padded[1];
bindings[3] = in0_gpu_padded[2];
bindings[4] = in0_gpu_padded[3];
bindings[5] = in0_gpu_padded[4];
bindings[6] = in0_gpu_padded[5];
bindings[7] = in0_gpu_padded[6];
bindings[8] = in0_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded[0].w;
constants[4].i = in0_gpu_padded[0].h;
constants[5].i = in0_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded[0]);
}
{
in1_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded[0];
bindings[2] = in1_gpu_padded[1];
bindings[3] = in1_gpu_padded[2];
bindings[4] = in1_gpu_padded[3];
bindings[5] = in1_gpu_padded[4];
bindings[6] = in1_gpu_padded[5];
bindings[7] = in1_gpu_padded[6];
bindings[8] = in1_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded[0].w;
constants[4].i = in1_gpu_padded[0].h;
constants[5].i = in1_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded[0]);
}
ncnn::VkMat flow[8];
for (int ti = 0; ti < 8; ti++)
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
if (uhd_mode)
{
ncnn::VkMat in0_gpu_padded_downscaled;
ncnn::VkMat in1_gpu_padded_downscaled;
rife_uhd_downscale_image->forward(in0_gpu_padded[ti], in0_gpu_padded_downscaled, cmd, opt);
rife_uhd_downscale_image->forward(in1_gpu_padded[ti], in1_gpu_padded_downscaled, cmd, opt);
ex.input("input0", in0_gpu_padded_downscaled);
ex.input("input1", in1_gpu_padded_downscaled);
ncnn::VkMat flow_downscaled;
ex.extract("flow", flow_downscaled, cmd);
ncnn::VkMat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, cmd, opt);
rife_uhd_double_flow->forward(flow_half, flow[ti], cmd, opt);
}
else
{
ex.input("input0", in0_gpu_padded[ti]);
ex.input("input1", in1_gpu_padded[ti]);
ex.extract("flow", flow[ti], cmd);
}
}
ncnn::VkMat flow_reversed[8];
if (tta_temporal_mode)
{
for (int ti = 0; ti < 8; ti++)
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
if (uhd_mode)
{
ncnn::VkMat in0_gpu_padded_downscaled;
ncnn::VkMat in1_gpu_padded_downscaled;
rife_uhd_downscale_image->forward(in0_gpu_padded[ti], in0_gpu_padded_downscaled, cmd, opt);
rife_uhd_downscale_image->forward(in1_gpu_padded[ti], in1_gpu_padded_downscaled, cmd, opt);
ex.input("input0", in1_gpu_padded_downscaled);
ex.input("input1", in0_gpu_padded_downscaled);
ncnn::VkMat flow_downscaled;
ex.extract("flow", flow_downscaled, cmd);
ncnn::VkMat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, cmd, opt);
rife_uhd_double_flow->forward(flow_half, flow_reversed[ti], cmd, opt);
}
else
{
ex.input("input0", in1_gpu_padded[ti]);
ex.input("input1", in0_gpu_padded[ti]);
ex.extract("flow", flow_reversed[ti], cmd);
}
}
}
// avg flow
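// the shader maps each variant's flow back to the canonical orientation
// (with the sign flips and x/y swaps the rotations require) before averaging,
// mirroring the cpu implementation further below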
ncnn::VkMat flow0[8];
ncnn::VkMat flow1[8];
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow[0];
bindings[1] = flow[1];
bindings[2] = flow[2];
bindings[3] = flow[3];
bindings[4] = flow[4];
bindings[5] = flow[5];
bindings[6] = flow[6];
bindings[7] = flow[7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[0].w;
constants[1].i = flow[0].h;
constants[2].i = flow[0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[0].w;
dispatcher.h = flow[0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
}
if (tta_temporal_mode)
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow_reversed[0];
bindings[1] = flow_reversed[1];
bindings[2] = flow_reversed[2];
bindings[3] = flow_reversed[3];
bindings[4] = flow_reversed[4];
bindings[5] = flow_reversed[5];
bindings[6] = flow_reversed[6];
bindings[7] = flow_reversed[7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow_reversed[0].w;
constants[1].i = flow_reversed[0].h;
constants[2].i = flow_reversed[0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow_reversed[0].w;
dispatcher.h = flow_reversed[0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
// merge flow and flow_reversed
for (int ti = 0; ti < 8; ti++)
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = flow[ti];
bindings[1] = flow_reversed[ti];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[ti].w;
constants[1].i = flow[ti].h;
constants[2].i = flow[ti].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[ti].w;
dispatcher.h = flow[ti].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_temporal_avg, bindings, constants, dispatcher);
}
}
if (rife_v2)
{
for (int ti = 0; ti < 8; ti++)
{
std::vector<ncnn::VkMat> inputs(1);
inputs[0] = flow[ti];
std::vector<ncnn::VkMat> outputs(2);
rife_v2_slice_flow->forward(inputs, outputs, cmd, opt);
flow0[ti] = outputs[0];
flow1[ti] = outputs[1];
}
}
ncnn::VkMat out_gpu_padded[8];
for (int ti = 0; ti < 8; ti++)
{
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded[ti]);
if (rife_v2)
{
ex.input("flow.0", flow0[ti]);
}
else
{
ex.input("flow.0", flow[ti]);
}
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded[ti]);
if (rife_v2)
{
ex.input("flow.0", flow1[ti]);
}
else
{
ex.input("flow.1", flow[ti]);
}
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded[ti]);
ex.input("img1", in1_gpu_padded[ti]);
ex.input("flow", flow[ti]);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
if (!tta_temporal_mode)
{
if (ti == 0)
{
in0_gpu.release();
in1_gpu.release();
}
else
{
in0_gpu_padded[ti - 1].release();
in1_gpu_padded[ti - 1].release();
}
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
}
if (ti != 0)
{
flow[ti - 1].release();
}
ex.extract("output", out_gpu_padded[ti], cmd);
}
if (tta_temporal_mode)
{
// fusionnet
ncnn::VkMat out_gpu_padded_reversed;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in1_gpu_padded[ti]);
ex.input("img1", in0_gpu_padded[ti]);
ex.input("flow", flow_reversed[ti]);
ex.input("3", ctx1[0]);
ex.input("4", ctx1[1]);
ex.input("5", ctx1[2]);
ex.input("6", ctx1[3]);
ex.input("7", ctx0[0]);
ex.input("8", ctx0[1]);
ex.input("9", ctx0[2]);
ex.input("10", ctx0[3]);
// save some memory
if (ti == 0)
{
in0_gpu.release();
in1_gpu.release();
}
else
{
in0_gpu_padded[ti - 1].release();
in1_gpu_padded[ti - 1].release();
flow_reversed[ti - 1].release();
}
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded_reversed, cmd);
}
// merge output
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded[ti];
bindings[1] = out_gpu_padded_reversed;
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = out_gpu_padded[ti].w;
constants[1].i = out_gpu_padded[ti].h;
constants[2].i = out_gpu_padded[ti].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = out_gpu_padded[ti].w;
dispatcher.h = out_gpu_padded[ti].h;
dispatcher.c = 3;
cmd.record_pipeline(rife_out_tta_temporal_avg, bindings, constants, dispatcher);
}
}
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = out_gpu_padded[0];
bindings[1] = out_gpu_padded[1];
bindings[2] = out_gpu_padded[2];
bindings[3] = out_gpu_padded[3];
bindings[4] = out_gpu_padded[4];
bindings[5] = out_gpu_padded[5];
bindings[6] = out_gpu_padded[6];
bindings[7] = out_gpu_padded[7];
bindings[8] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded[0].w;
constants[1].i = out_gpu_padded[0].h;
constants[2].i = out_gpu_padded[0].cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
else
{
// preproc
ncnn::VkMat in0_gpu_padded;
ncnn::VkMat in1_gpu_padded;
{
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded.w;
constants[4].i = in0_gpu_padded.h;
constants[5].i = in0_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
}
{
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded.w;
constants[4].i = in1_gpu_padded.h;
constants[5].i = in1_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
}
// flownet
ncnn::VkMat flow;
ncnn::VkMat flow0;
ncnn::VkMat flow1;
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
if (uhd_mode)
{
ncnn::VkMat in0_gpu_padded_downscaled;
ncnn::VkMat in1_gpu_padded_downscaled;
rife_uhd_downscale_image->forward(in0_gpu_padded, in0_gpu_padded_downscaled, cmd, opt);
rife_uhd_downscale_image->forward(in1_gpu_padded, in1_gpu_padded_downscaled, cmd, opt);
ex.input("input0", in0_gpu_padded_downscaled);
ex.input("input1", in1_gpu_padded_downscaled);
ncnn::VkMat flow_downscaled;
ex.extract("flow", flow_downscaled, cmd);
ncnn::VkMat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, cmd, opt);
rife_uhd_double_flow->forward(flow_half, flow, cmd, opt);
}
else
{
ex.input("input0", in0_gpu_padded);
ex.input("input1", in1_gpu_padded);
ex.extract("flow", flow, cmd);
}
}
ncnn::VkMat flow_reversed;
if (tta_temporal_mode)
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
if (uhd_mode)
{
ncnn::VkMat in0_gpu_padded_downscaled;
ncnn::VkMat in1_gpu_padded_downscaled;
rife_uhd_downscale_image->forward(in0_gpu_padded, in0_gpu_padded_downscaled, cmd, opt);
rife_uhd_downscale_image->forward(in1_gpu_padded, in1_gpu_padded_downscaled, cmd, opt);
ex.input("input0", in1_gpu_padded_downscaled);
ex.input("input1", in0_gpu_padded_downscaled);
ncnn::VkMat flow_downscaled;
ex.extract("flow", flow_downscaled, cmd);
ncnn::VkMat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, cmd, opt);
rife_uhd_double_flow->forward(flow_half, flow_reversed, cmd, opt);
}
else
{
ex.input("input0", in1_gpu_padded);
ex.input("input1", in0_gpu_padded);
ex.extract("flow", flow_reversed, cmd);
}
// merge flow and flow_reversed
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = flow;
bindings[1] = flow_reversed;
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow.w;
constants[1].i = flow.h;
constants[2].i = flow.cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow.w;
dispatcher.h = flow.h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_temporal_avg, bindings, constants, dispatcher);
}
}
if (rife_v2)
{
std::vector<ncnn::VkMat> inputs(1);
inputs[0] = flow;
std::vector<ncnn::VkMat> outputs(2);
rife_v2_slice_flow->forward(inputs, outputs, cmd, opt);
flow0 = outputs[0];
flow1 = outputs[1];
}
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded);
if (rife_v2)
{
ex.input("flow.0", flow0);
}
else
{
ex.input("flow.0", flow);
}
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded);
if (rife_v2)
{
ex.input("flow.0", flow1);
}
else
{
ex.input("flow.1", flow);
}
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
ncnn::VkMat out_gpu_padded;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded);
ex.input("img1", in1_gpu_padded);
ex.input("flow", flow);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
if (!tta_temporal_mode)
{
// save some memory
in0_gpu.release();
in1_gpu.release();
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
}
flow.release();
ex.extract("output", out_gpu_padded, cmd);
}
if (tta_temporal_mode)
{
// fusionnet
ncnn::VkMat out_gpu_padded_reversed;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in1_gpu_padded);
ex.input("img1", in0_gpu_padded);
ex.input("flow", flow_reversed);
ex.input("3", ctx1[0]);
ex.input("4", ctx1[1]);
ex.input("5", ctx1[2]);
ex.input("6", ctx1[3]);
ex.input("7", ctx0[0]);
ex.input("8", ctx0[1]);
ex.input("9", ctx0[2]);
ex.input("10", ctx0[3]);
// save some memory
in0_gpu.release();
in1_gpu.release();
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
flow_reversed.release();
ex.extract("output", out_gpu_padded_reversed, cmd);
}
// merge output
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu_padded_reversed;
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
ncnn::VkMat dispatcher;
dispatcher.w = out_gpu_padded.w;
dispatcher.h = out_gpu_padded.h;
dispatcher.c = 3;
cmd.record_pipeline(rife_out_tta_temporal_avg, bindings, constants, dispatcher);
}
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
// download
{
ncnn::Mat out;
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out = ncnn::Mat(out_gpu.w, out_gpu.h, (unsigned char*)outimage.data, (size_t)channels, 1);
}
cmd.record_clone(out_gpu, out, opt);
cmd.submit_and_wait();
if (!(opt.use_fp16_storage && opt.use_int8_storage))
{
#if _WIN32
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB2BGR);
#else
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB);
#endif
}
}
vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);
return 0;
}
int RIFE::process_cpu(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float timestep, ncnn::Mat& outimage) const
{
if (timestep == 0.f)
{
outimage = in0image;
return 0;
}
if (timestep == 1.f)
{
outimage = in1image;
return 0;
}
const unsigned char* pixel0data = (const unsigned char*)in0image.data;
const unsigned char* pixel1data = (const unsigned char*)in1image.data;
const int w = in0image.w;
const int h = in0image.h;
const int channels = 3;//in0image.elempack;
// fprintf(stderr, "%d x %d\n", w, h);
ncnn::Option opt = flownet.opt;
// pad to 32n
int w_padded = (w + 31) / 32 * 32;
int h_padded = (h + 31) / 32 * 32;
ncnn::Mat in0;
ncnn::Mat in1;
{
#if _WIN32
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
#else
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_RGB, w, h);
#endif
}
ncnn::Mat out;
if (tta_mode)
{
// preproc and border padding
ncnn::Mat in0_padded[8];
ncnn::Mat in1_padded[8];
{
in0_padded[0].create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in0_padded[0].channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in0.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
{
in1_padded[0].create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in1_padded[0].channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in1.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
// the other 7 directions
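// same 8 orientations as the gpu path: [1]-[3] mirror/rotate within w x h,
// [4]-[7] transpose to h x w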
{
in0_padded[1].create(w_padded, h_padded, 3);
in0_padded[2].create(w_padded, h_padded, 3);
in0_padded[3].create(w_padded, h_padded, 3);
in0_padded[4].create(h_padded, w_padded, 3);
in0_padded[5].create(h_padded, w_padded, 3);
in0_padded[6].create(h_padded, w_padded, 3);
in0_padded[7].create(h_padded, w_padded, 3);
for (int q = 0; q < 3; q++)
{
const ncnn::Mat in0_padded_0 = in0_padded[0].channel(q);
ncnn::Mat in0_padded_1 = in0_padded[1].channel(q);
ncnn::Mat in0_padded_2 = in0_padded[2].channel(q);
ncnn::Mat in0_padded_3 = in0_padded[3].channel(q);
ncnn::Mat in0_padded_4 = in0_padded[4].channel(q);
ncnn::Mat in0_padded_5 = in0_padded[5].channel(q);
ncnn::Mat in0_padded_6 = in0_padded[6].channel(q);
ncnn::Mat in0_padded_7 = in0_padded[7].channel(q);
for (int i = 0; i < h_padded; i++)
{
const float* outptr0 = in0_padded_0.row(i);
float* outptr1 = in0_padded_1.row(i) + w_padded - 1;
float* outptr2 = in0_padded_2.row(h_padded - 1 - i) + w_padded - 1;
float* outptr3 = in0_padded_3.row(h_padded - 1 - i);
for (int j = 0; j < w_padded; j++)
{
float* outptr4 = in0_padded_4.row(j) + i;
float* outptr5 = in0_padded_5.row(j) + h_padded - 1 - i;
float* outptr6 = in0_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
float* outptr7 = in0_padded_7.row(w_padded - 1 - j) + i;
float v = *outptr0++;
*outptr1-- = v;
*outptr2-- = v;
*outptr3++ = v;
*outptr4 = v;
*outptr5 = v;
*outptr6 = v;
*outptr7 = v;
}
}
}
}
{
in1_padded[1].create(w_padded, h_padded, 3);
in1_padded[2].create(w_padded, h_padded, 3);
in1_padded[3].create(w_padded, h_padded, 3);
in1_padded[4].create(h_padded, w_padded, 3);
in1_padded[5].create(h_padded, w_padded, 3);
in1_padded[6].create(h_padded, w_padded, 3);
in1_padded[7].create(h_padded, w_padded, 3);
for (int q = 0; q < 3; q++)
{
const ncnn::Mat in1_padded_0 = in1_padded[0].channel(q);
ncnn::Mat in1_padded_1 = in1_padded[1].channel(q);
ncnn::Mat in1_padded_2 = in1_padded[2].channel(q);
ncnn::Mat in1_padded_3 = in1_padded[3].channel(q);
ncnn::Mat in1_padded_4 = in1_padded[4].channel(q);
ncnn::Mat in1_padded_5 = in1_padded[5].channel(q);
ncnn::Mat in1_padded_6 = in1_padded[6].channel(q);
ncnn::Mat in1_padded_7 = in1_padded[7].channel(q);
for (int i = 0; i < h_padded; i++)
{
const float* outptr0 = in1_padded_0.row(i);
float* outptr1 = in1_padded_1.row(i) + w_padded - 1;
float* outptr2 = in1_padded_2.row(h_padded - 1 - i) + w_padded - 1;
float* outptr3 = in1_padded_3.row(h_padded - 1 - i);
for (int j = 0; j < w_padded; j++)
{
float* outptr4 = in1_padded_4.row(j) + i;
float* outptr5 = in1_padded_5.row(j) + h_padded - 1 - i;
float* outptr6 = in1_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
float* outptr7 = in1_padded_7.row(w_padded - 1 - j) + i;
float v = *outptr0++;
*outptr1-- = v;
*outptr2-- = v;
*outptr3++ = v;
*outptr4 = v;
*outptr5 = v;
*outptr6 = v;
*outptr7 = v;
}
}
}
}
ncnn::Mat flow[8];
for (int ti = 0; ti < 8; ti++)
{
// flownet
{
ncnn::Extractor ex = flownet.create_extractor();
if (uhd_mode)
{
ncnn::Mat in0_padded_downscaled;
ncnn::Mat in1_padded_downscaled;
rife_uhd_downscale_image->forward(in0_padded[ti], in0_padded_downscaled, opt);
rife_uhd_downscale_image->forward(in1_padded[ti], in1_padded_downscaled, opt);
ex.input("input0", in0_padded_downscaled);
ex.input("input1", in1_padded_downscaled);
ncnn::Mat flow_downscaled;
ex.extract("flow", flow_downscaled);
ncnn::Mat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, opt);
rife_uhd_double_flow->forward(flow_half, flow[ti], opt);
}
else
{
ex.input("input0", in0_padded[ti]);
ex.input("input1", in1_padded[ti]);
ex.extract("flow", flow[ti]);
}
}
}
ncnn::Mat flow_reversed[8];
if (tta_temporal_mode)
{
for (int ti = 0; ti < 8; ti++)
{
// flownet
{
ncnn::Extractor ex = flownet.create_extractor();
if (uhd_mode)
{
ncnn::Mat in0_padded_downscaled;
ncnn::Mat in1_padded_downscaled;
rife_uhd_downscale_image->forward(in0_padded[ti], in0_padded_downscaled, opt);
rife_uhd_downscale_image->forward(in1_padded[ti], in1_padded_downscaled, opt);
ex.input("input0", in1_padded_downscaled);
ex.input("input1", in0_padded_downscaled);
ncnn::Mat flow_downscaled;
ex.extract("flow", flow_downscaled);
ncnn::Mat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, opt);
rife_uhd_double_flow->forward(flow_half, flow_reversed[ti], opt);
}
else
{
ex.input("input0", in1_padded[ti]);
ex.input("input1", in0_padded[ti]);
ex.extract("flow", flow_reversed[ti]);
}
}
// merge flow and flow_reversed
{
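// forward and reversed flow should be approximately opposite, so average
// f and -f_rev; for v2 the 4-channel blob stacks the two flows, so the
// channel pairs swap roles instead of flipping sign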
float* flow_x = flow[ti].channel(0);
float* flow_y = flow[ti].channel(1);
float* flow_reversed_x = flow_reversed[ti].channel(0);
float* flow_reversed_y = flow_reversed[ti].channel(1);
if (rife_v2)
{
float* flow_z = flow[ti].channel(2);
float* flow_w = flow[ti].channel(3);
float* flow_reversed_z = flow_reversed[ti].channel(2);
float* flow_reversed_w = flow_reversed[ti].channel(3);
for (int i = 0; i < flow[ti].h; i++)
{
for (int j = 0; j < flow[ti].w; j++)
{
float x = (*flow_x + *flow_reversed_z) * 0.5f;
float y = (*flow_y + *flow_reversed_w) * 0.5f;
float z = (*flow_z + *flow_reversed_x) * 0.5f;
float w = (*flow_w + *flow_reversed_y) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_z++ = z;
*flow_w++ = w;
*flow_reversed_x++ = z;
*flow_reversed_y++ = w;
*flow_reversed_z++ = x;
*flow_reversed_w++ = y;
}
}
}
else
{
for (int i = 0; i < flow[ti].h; i++)
{
for (int j = 0; j < flow[ti].w; j++)
{
float x = (*flow_x - *flow_reversed_x) * 0.5f;
float y = (*flow_y - *flow_reversed_y) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_reversed_x++ = -x;
*flow_reversed_y++ = -y;
}
}
}
}
}
}
// avg flow
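// undo each augmentation before averaging: mirrored/rotated variants need
// sign flips on the flow components, and the transposed variants [4..7]
// contribute their y component to x and vice versa; the averaged result is
// written back into every variant in its own orientation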
ncnn::Mat flow0[8];
ncnn::Mat flow1[8];
{
ncnn::Mat flow_x0 = flow[0].channel(0);
ncnn::Mat flow_x1 = flow[1].channel(0);
ncnn::Mat flow_x2 = flow[2].channel(0);
ncnn::Mat flow_x3 = flow[3].channel(0);
ncnn::Mat flow_x4 = flow[4].channel(0);
ncnn::Mat flow_x5 = flow[5].channel(0);
ncnn::Mat flow_x6 = flow[6].channel(0);
ncnn::Mat flow_x7 = flow[7].channel(0);
ncnn::Mat flow_y0 = flow[0].channel(1);
ncnn::Mat flow_y1 = flow[1].channel(1);
ncnn::Mat flow_y2 = flow[2].channel(1);
ncnn::Mat flow_y3 = flow[3].channel(1);
ncnn::Mat flow_y4 = flow[4].channel(1);
ncnn::Mat flow_y5 = flow[5].channel(1);
ncnn::Mat flow_y6 = flow[6].channel(1);
ncnn::Mat flow_y7 = flow[7].channel(1);
if (rife_v2)
{
ncnn::Mat flow_z0 = flow[0].channel(2);
ncnn::Mat flow_z1 = flow[1].channel(2);
ncnn::Mat flow_z2 = flow[2].channel(2);
ncnn::Mat flow_z3 = flow[3].channel(2);
ncnn::Mat flow_z4 = flow[4].channel(2);
ncnn::Mat flow_z5 = flow[5].channel(2);
ncnn::Mat flow_z6 = flow[6].channel(2);
ncnn::Mat flow_z7 = flow[7].channel(2);
ncnn::Mat flow_w0 = flow[0].channel(3);
ncnn::Mat flow_w1 = flow[1].channel(3);
ncnn::Mat flow_w2 = flow[2].channel(3);
ncnn::Mat flow_w3 = flow[3].channel(3);
ncnn::Mat flow_w4 = flow[4].channel(3);
ncnn::Mat flow_w5 = flow[5].channel(3);
ncnn::Mat flow_w6 = flow[6].channel(3);
ncnn::Mat flow_w7 = flow[7].channel(3);
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
float* z0 = flow_z0.row(i);
float* z1 = flow_z1.row(i) + flow_x0.w - 1;
float* z2 = flow_z2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* z3 = flow_z3.row(flow_x0.h - 1 - i);
float* w0 = flow_w0.row(i);
float* w1 = flow_w1.row(i) + flow_x0.w - 1;
float* w2 = flow_w2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* w3 = flow_w3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float* z4 = flow_z4.row(j) + i;
float* z5 = flow_z5.row(j) + flow_x0.h - 1 - i;
float* z6 = flow_z6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* z7 = flow_z7.row(flow_x0.w - 1 - j) + i;
float* w4 = flow_w4.row(j) + i;
float* w5 = flow_w5.row(j) + flow_x0.h - 1 - i;
float* w6 = flow_w6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* w7 = flow_w7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
float z = (*z0 + -*z1 + -*z2 + *z3 + *w4 + *w5 + -*w6 + -*w7) * 0.125f;
float w = (*w0 + *w1 + -*w2 + -*w3 + *z4 + -*z5 + -*z6 + *z7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
*z0++ = z;
*z1-- = -z;
*z2-- = -z;
*z3++ = z;
*z4 = w;
*z5 = -w;
*z6 = -w;
*z7 = w;
*w0++ = w;
*w1-- = w;
*w2-- = -w;
*w3++ = -w;
*w4 = z;
*w5 = z;
*w6 = -z;
*w7 = -z;
}
}
}
else
{
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
}
}
}
}
if (tta_temporal_mode)
{
ncnn::Mat flow_x0 = flow_reversed[0].channel(0);
ncnn::Mat flow_x1 = flow_reversed[1].channel(0);
ncnn::Mat flow_x2 = flow_reversed[2].channel(0);
ncnn::Mat flow_x3 = flow_reversed[3].channel(0);
ncnn::Mat flow_x4 = flow_reversed[4].channel(0);
ncnn::Mat flow_x5 = flow_reversed[5].channel(0);
ncnn::Mat flow_x6 = flow_reversed[6].channel(0);
ncnn::Mat flow_x7 = flow_reversed[7].channel(0);
ncnn::Mat flow_y0 = flow_reversed[0].channel(1);
ncnn::Mat flow_y1 = flow_reversed[1].channel(1);
ncnn::Mat flow_y2 = flow_reversed[2].channel(1);
ncnn::Mat flow_y3 = flow_reversed[3].channel(1);
ncnn::Mat flow_y4 = flow_reversed[4].channel(1);
ncnn::Mat flow_y5 = flow_reversed[5].channel(1);
ncnn::Mat flow_y6 = flow_reversed[6].channel(1);
ncnn::Mat flow_y7 = flow_reversed[7].channel(1);
if (rife_v2)
{
ncnn::Mat flow_z0 = flow_reversed[0].channel(2);
ncnn::Mat flow_z1 = flow_reversed[1].channel(2);
ncnn::Mat flow_z2 = flow_reversed[2].channel(2);
ncnn::Mat flow_z3 = flow_reversed[3].channel(2);
ncnn::Mat flow_z4 = flow_reversed[4].channel(2);
ncnn::Mat flow_z5 = flow_reversed[5].channel(2);
ncnn::Mat flow_z6 = flow_reversed[6].channel(2);
ncnn::Mat flow_z7 = flow_reversed[7].channel(2);
ncnn::Mat flow_w0 = flow_reversed[0].channel(3);
ncnn::Mat flow_w1 = flow_reversed[1].channel(3);
ncnn::Mat flow_w2 = flow_reversed[2].channel(3);
ncnn::Mat flow_w3 = flow_reversed[3].channel(3);
ncnn::Mat flow_w4 = flow_reversed[4].channel(3);
ncnn::Mat flow_w5 = flow_reversed[5].channel(3);
ncnn::Mat flow_w6 = flow_reversed[6].channel(3);
ncnn::Mat flow_w7 = flow_reversed[7].channel(3);
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
float* z0 = flow_z0.row(i);
float* z1 = flow_z1.row(i) + flow_x0.w - 1;
float* z2 = flow_z2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* z3 = flow_z3.row(flow_x0.h - 1 - i);
float* w0 = flow_w0.row(i);
float* w1 = flow_w1.row(i) + flow_x0.w - 1;
float* w2 = flow_w2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* w3 = flow_w3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float* z4 = flow_z4.row(j) + i;
float* z5 = flow_z5.row(j) + flow_x0.h - 1 - i;
float* z6 = flow_z6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* z7 = flow_z7.row(flow_x0.w - 1 - j) + i;
float* w4 = flow_w4.row(j) + i;
float* w5 = flow_w5.row(j) + flow_x0.h - 1 - i;
float* w6 = flow_w6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* w7 = flow_w7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
float z = (*z0 + -*z1 + -*z2 + *z3 + *w4 + *w5 + -*w6 + -*w7) * 0.125f;
float w = (*w0 + *w1 + -*w2 + -*w3 + *z4 + -*z5 + -*z6 + *z7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
*z0++ = z;
*z1-- = -z;
*z2-- = -z;
*z3++ = z;
*z4 = w;
*z5 = -w;
*z6 = -w;
*z7 = w;
*w0++ = w;
*w1-- = w;
*w2-- = -w;
*w3++ = -w;
*w4 = z;
*w5 = z;
*w6 = -z;
*w7 = -z;
}
}
}
else
{
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
}
}
}
// merge flow and flow_reversed
for (int ti = 0; ti < 8; ti++)
{
float* flow_x = flow[ti].channel(0);
float* flow_y = flow[ti].channel(1);
float* flow_reversed_x = flow_reversed[ti].channel(0);
float* flow_reversed_y = flow_reversed[ti].channel(1);
if (rife_v2)
{
float* flow_z = flow[ti].channel(2);
float* flow_w = flow[ti].channel(3);
float* flow_reversed_z = flow_reversed[ti].channel(2);
float* flow_reversed_w = flow_reversed[ti].channel(3);
for (int i = 0; i < flow[ti].h; i++)
{
for (int j = 0; j < flow[ti].w; j++)
{
float x = (*flow_x + *flow_reversed_z) * 0.5f;
float y = (*flow_y + *flow_reversed_w) * 0.5f;
float z = (*flow_z + *flow_reversed_x) * 0.5f;
float w = (*flow_w + *flow_reversed_y) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_z++ = z;
*flow_w++ = w;
*flow_reversed_x++ = z;
*flow_reversed_y++ = w;
*flow_reversed_z++ = x;
*flow_reversed_w++ = y;
}
}
}
else
{
for (int i = 0; i < flow[ti].h; i++)
{
for (int j = 0; j < flow[ti].w; j++)
{
float x = (*flow_x - *flow_reversed_x) * 0.5f;
float y = (*flow_y - *flow_reversed_y) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_reversed_x++ = -x;
*flow_reversed_y++ = -y;
}
}
}
}
}
if (rife_v2)
{
for (int ti = 0; ti < 8; ti++)
{
std::vector<ncnn::Mat> inputs(1);
inputs[0] = flow[ti];
std::vector<ncnn::Mat> outputs(2);
rife_v2_slice_flow->forward(inputs, outputs, opt);
flow0[ti] = outputs[0];
flow1[ti] = outputs[1];
}
}
ncnn::Mat out_padded[8];
ncnn::Mat out_padded_reversed[8];
for (int ti = 0; ti < 8; ti++)
{
// contextnet
ncnn::Mat ctx0[4];
ncnn::Mat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.input("input.1", in0_padded[ti]);
if (rife_v2)
{
ex.input("flow.0", flow0[ti]);
}
else
{
ex.input("flow.0", flow[ti]);
}
ex.extract("f1", ctx0[0]);
ex.extract("f2", ctx0[1]);
ex.extract("f3", ctx0[2]);
ex.extract("f4", ctx0[3]);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.input("input.1", in1_padded[ti]);
if (rife_v2)
{
ex.input("flow.0", flow1[ti]);
}
else
{
ex.input("flow.1", flow[ti]);
}
ex.extract("f1", ctx1[0]);
ex.extract("f2", ctx1[1]);
ex.extract("f3", ctx1[2]);
ex.extract("f4", ctx1[3]);
}
// fusionnet
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.input("img0", in0_padded[ti]);
ex.input("img1", in1_padded[ti]);
ex.input("flow", flow[ti]);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
ex.extract("output", out_padded[ti]);
}
if (tta_temporal_mode)
{
// fusionnet
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.input("img0", in1_padded[ti]);
ex.input("img1", in0_padded[ti]);
ex.input("flow", flow_reversed[ti]);
ex.input("3", ctx1[0]);
ex.input("4", ctx1[1]);
ex.input("5", ctx1[2]);
ex.input("6", ctx1[3]);
ex.input("7", ctx0[0]);
ex.input("8", ctx0[1]);
ex.input("9", ctx0[2]);
ex.input("10", ctx0[3]);
ex.extract("output", out_padded_reversed[ti]);
}
}
}
// cut padding and postproc
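// average the 8 outputs (16 with temporal tta) back in the canonical
// orientation, crop the padding, and convert to 8-bit with round-to-nearest
// (* 255.f + 0.5f)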
out.create(w, h, 3);
if (tta_temporal_mode)
{
for (int q = 0; q < 3; q++)
{
const ncnn::Mat out_padded_0 = out_padded[0].channel(q);
const ncnn::Mat out_padded_1 = out_padded[1].channel(q);
const ncnn::Mat out_padded_2 = out_padded[2].channel(q);
const ncnn::Mat out_padded_3 = out_padded[3].channel(q);
const ncnn::Mat out_padded_4 = out_padded[4].channel(q);
const ncnn::Mat out_padded_5 = out_padded[5].channel(q);
const ncnn::Mat out_padded_6 = out_padded[6].channel(q);
const ncnn::Mat out_padded_7 = out_padded[7].channel(q);
const ncnn::Mat out_padded_reversed_0 = out_padded_reversed[0].channel(q);
const ncnn::Mat out_padded_reversed_1 = out_padded_reversed[1].channel(q);
const ncnn::Mat out_padded_reversed_2 = out_padded_reversed[2].channel(q);
const ncnn::Mat out_padded_reversed_3 = out_padded_reversed[3].channel(q);
const ncnn::Mat out_padded_reversed_4 = out_padded_reversed[4].channel(q);
const ncnn::Mat out_padded_reversed_5 = out_padded_reversed[5].channel(q);
const ncnn::Mat out_padded_reversed_6 = out_padded_reversed[6].channel(q);
const ncnn::Mat out_padded_reversed_7 = out_padded_reversed[7].channel(q);
float* outptr = out.channel(q);
for (int i = 0; i < h; i++)
{
const float* ptr0 = out_padded_0.row(i);
const float* ptr1 = out_padded_1.row(i) + w_padded - 1;
const float* ptr2 = out_padded_2.row(h_padded - 1 - i) + w_padded - 1;
const float* ptr3 = out_padded_3.row(h_padded - 1 - i);
const float* ptrr0 = out_padded_reversed_0.row(i);
const float* ptrr1 = out_padded_reversed_1.row(i) + w_padded - 1;
const float* ptrr2 = out_padded_reversed_2.row(h_padded - 1 - i) + w_padded - 1;
const float* ptrr3 = out_padded_reversed_3.row(h_padded - 1 - i);
for (int j = 0; j < w; j++)
{
const float* ptr4 = out_padded_4.row(j) + i;
const float* ptr5 = out_padded_5.row(j) + h_padded - 1 - i;
const float* ptr6 = out_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
const float* ptr7 = out_padded_7.row(w_padded - 1 - j) + i;
const float* ptrr4 = out_padded_reversed_4.row(j) + i;
const float* ptrr5 = out_padded_reversed_5.row(j) + h_padded - 1 - i;
const float* ptrr6 = out_padded_reversed_6.row(w_padded - 1 - j) + h_padded - 1 - i;
const float* ptrr7 = out_padded_reversed_7.row(w_padded - 1 - j) + i;
float v = (*ptr0++ + *ptr1-- + *ptr2-- + *ptr3++ + *ptr4 + *ptr5 + *ptr6 + *ptr7) / 8;
float vr = (*ptrr0++ + *ptrr1-- + *ptrr2-- + *ptrr3++ + *ptrr4 + *ptrr5 + *ptrr6 + *ptrr7) / 8;
*outptr++ = (v + vr) * 0.5f * 255.f + 0.5f;
}
}
}
}
else
{
for (int q = 0; q < 3; q++)
{
const ncnn::Mat out_padded_0 = out_padded[0].channel(q);
const ncnn::Mat out_padded_1 = out_padded[1].channel(q);
const ncnn::Mat out_padded_2 = out_padded[2].channel(q);
const ncnn::Mat out_padded_3 = out_padded[3].channel(q);
const ncnn::Mat out_padded_4 = out_padded[4].channel(q);
const ncnn::Mat out_padded_5 = out_padded[5].channel(q);
const ncnn::Mat out_padded_6 = out_padded[6].channel(q);
const ncnn::Mat out_padded_7 = out_padded[7].channel(q);
float* outptr = out.channel(q);
for (int i = 0; i < h; i++)
{
const float* ptr0 = out_padded_0.row(i);
const float* ptr1 = out_padded_1.row(i) + w_padded - 1;
const float* ptr2 = out_padded_2.row(h_padded - 1 - i) + w_padded - 1;
const float* ptr3 = out_padded_3.row(h_padded - 1 - i);
for (int j = 0; j < w; j++)
{
const float* ptr4 = out_padded_4.row(j) + i;
const float* ptr5 = out_padded_5.row(j) + h_padded - 1 - i;
const float* ptr6 = out_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
const float* ptr7 = out_padded_7.row(w_padded - 1 - j) + i;
float v = (*ptr0++ + *ptr1-- + *ptr2-- + *ptr3++ + *ptr4 + *ptr5 + *ptr6 + *ptr7) / 8;
*outptr++ = v * 255.f + 0.5f;
}
}
}
}
}
else
{
// preproc and border padding
ncnn::Mat in0_padded;
ncnn::Mat in1_padded;
{
in0_padded.create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in0_padded.channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in0.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
{
in1_padded.create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in1_padded.channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in1.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
// flownet
ncnn::Mat flow;
ncnn::Mat flow0;
ncnn::Mat flow1;
{
ncnn::Extractor ex = flownet.create_extractor();
if (uhd_mode)
{
ncnn::Mat in0_padded_downscaled;
ncnn::Mat in1_padded_downscaled;
rife_uhd_downscale_image->forward(in0_padded, in0_padded_downscaled, opt);
rife_uhd_downscale_image->forward(in1_padded, in1_padded_downscaled, opt);
ex.input("input0", in0_padded_downscaled);
ex.input("input1", in1_padded_downscaled);
ncnn::Mat flow_downscaled;
ex.extract("flow", flow_downscaled);
ncnn::Mat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, opt);
rife_uhd_double_flow->forward(flow_half, flow, opt);
}
else
{
ex.input("input0", in0_padded);
ex.input("input1", in1_padded);
ex.extract("flow", flow);
}
}
ncnn::Mat flow_reversed;
if (tta_temporal_mode)
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
if (uhd_mode)
{
ncnn::Mat in0_padded_downscaled;
ncnn::Mat in1_padded_downscaled;
rife_uhd_downscale_image->forward(in0_padded, in0_padded_downscaled, opt);
rife_uhd_downscale_image->forward(in1_padded, in1_padded_downscaled, opt);
ex.input("input0", in1_padded_downscaled);
ex.input("input1", in0_padded_downscaled);
ncnn::Mat flow_downscaled;
ex.extract("flow", flow_downscaled);
ncnn::Mat flow_half;
rife_uhd_upscale_flow->forward(flow_downscaled, flow_half, opt);
rife_uhd_double_flow->forward(flow_half, flow_reversed, opt);
}
else
{
ex.input("input0", in1_padded);
ex.input("input1", in0_padded);
ex.extract("flow", flow_reversed);
}
// merge flow and flow_reversed
{
float* flow_x = flow.channel(0);
float* flow_y = flow.channel(1);
float* flow_reversed_x = flow_reversed.channel(0);
float* flow_reversed_y = flow_reversed.channel(1);
if (rife_v2)
{
float* flow_z = flow.channel(2);
float* flow_w = flow.channel(3);
float* flow_reversed_z = flow_reversed.channel(2);
float* flow_reversed_w = flow_reversed.channel(3);
for (int i = 0; i < flow.h; i++)
{
for (int j = 0; j < flow.w; j++)
{
float x = (*flow_x + *flow_reversed_z) * 0.5f;
float y = (*flow_y + *flow_reversed_w) * 0.5f;
float z = (*flow_z + *flow_reversed_x) * 0.5f;
float w = (*flow_w + *flow_reversed_y) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_z++ = z;
*flow_w++ = w;
*flow_reversed_x++ = z;
*flow_reversed_y++ = w;
*flow_reversed_z++ = x;
*flow_reversed_w++ = y;
}
}
}
else
{
for (int i = 0; i < flow.h; i++)
{
for (int j = 0; j < flow.w; j++)
{
float x = (*flow_x - *flow_reversed_x) * 0.5f;
float y = (*flow_y - *flow_reversed_y) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_reversed_x++ = -x;
*flow_reversed_y++ = -y;
}
}
}
}
}
if (rife_v2)
{
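            // v2 contextnet wants only its own direction of the flow, so slice
            // the 4-channel blob into two 2-channel blobs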
std::vector<ncnn::Mat> inputs(1);
inputs[0] = flow;
std::vector<ncnn::Mat> outputs(2);
rife_v2_slice_flow->forward(inputs, outputs, opt);
flow0 = outputs[0];
flow1 = outputs[1];
}
// contextnet
ncnn::Mat ctx0[4];
ncnn::Mat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.input("input.1", in0_padded);
if (rife_v2)
{
ex.input("flow.0", flow0);
}
else
{
ex.input("flow.0", flow);
}
ex.extract("f1", ctx0[0]);
ex.extract("f2", ctx0[1]);
ex.extract("f3", ctx0[2]);
ex.extract("f4", ctx0[3]);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.input("input.1", in1_padded);
if (rife_v2)
{
ex.input("flow.0", flow1);
}
else
{
ex.input("flow.1", flow);
}
ex.extract("f1", ctx1[0]);
ex.extract("f2", ctx1[1]);
ex.extract("f3", ctx1[2]);
ex.extract("f4", ctx1[3]);
}
// fusionnet
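        // fusionnet blends both padded frames using the flow and the two
        // context pyramids (blob ids 3..10) and emits the frame in [0,1]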
ncnn::Mat out_padded;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.input("img0", in0_padded);
ex.input("img1", in1_padded);
ex.input("flow", flow);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
ex.extract("output", out_padded);
}
ncnn::Mat out_padded_reversed;
if (tta_temporal_mode)
{
// fusionnet
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.input("img0", in1_padded);
ex.input("img1", in0_padded);
ex.input("flow", flow_reversed);
ex.input("3", ctx1[0]);
ex.input("4", ctx1[1]);
ex.input("5", ctx1[2]);
ex.input("6", ctx1[3]);
ex.input("7", ctx0[0]);
ex.input("8", ctx0[1]);
ex.input("9", ctx0[2]);
ex.input("10", ctx0[3]);
ex.extract("output", out_padded_reversed);
}
}
// cut padding and postproc
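        // keep only the top-left w x h region; * 255.f maps [0,1] back to the
        // pixel range and the + 0.5f rounds to nearest on the float -> uint8
        // cast inside to_pixels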
out.create(w, h, 3);
if (tta_temporal_mode)
{
for (int q = 0; q < 3; q++)
{
float* outptr = out.channel(q);
const float* ptr = out_padded.channel(q);
const float* ptr1 = out_padded_reversed.channel(q);
for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)
{
*outptr++ = (*ptr++ + *ptr1++) * 0.5f * 255.f + 0.5f;
}
}
}
}
else
{
for (int q = 0; q < 3; q++)
{
float* outptr = out.channel(q);
const float* ptr = out_padded.channel(q);
for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)
{
*outptr++ = *ptr++ * 255.f + 0.5f;
}
}
}
}
}
// download
{
#if _WIN32
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB2BGR);
#else
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB);
#endif
}
return 0;
}

int RIFE::process_v4(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float timestep, ncnn::Mat& outimage) const
{
if (!vkdev)
{
// cpu only
return process_cpu(in0image, in1image, timestep, outimage);
}
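    // t == 0 and t == 1 reproduce the input frames exactly, skip inference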
if (timestep == 0.f)
{
outimage = in0image;
return 0;
}
if (timestep == 1.f)
{
outimage = in1image;
return 0;
}
const unsigned char* pixel0data = (const unsigned char*)in0image.data;
const unsigned char* pixel1data = (const unsigned char*)in1image.data;
const int w = in0image.w;
const int h = in0image.h;
    const int channels = 3; // in0image.elempack
// fprintf(stderr, "%d x %d\n", w, h);
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
ncnn::Option opt = flownet.opt;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;
// pad to 32n
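    // (flownet's pyramid repeatedly halves the resolution, so both dimensions
    // are rounded up to a multiple of 32, e.g. 1920x1080 -> 1920x1088; the
    // padding is cropped again in postproc)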
int w_padded = (w + 31) / 32 * 32;
int h_padded = (h + 31) / 32 * 32;
const size_t in_out_tile_elemsize = opt.use_fp16_storage ? 2u : 4u;
ncnn::Mat in0;
ncnn::Mat in1;
if (opt.use_fp16_storage && opt.use_int8_storage)
{
in0 = ncnn::Mat(w, h, (unsigned char*)pixel0data, (size_t)channels, 1);
in1 = ncnn::Mat(w, h, (unsigned char*)pixel1data, (size_t)channels, 1);
}
else
{
#if _WIN32
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
#else
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_RGB, w, h);
#endif
}
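    // with fp16+int8 storage the interleaved 8-bit pixels are wrapped as-is
    // and the preproc shader does the channel fix-up and 1/255 scaling on the
    // GPU; otherwise the conversion happens here on the CPU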
ncnn::VkCompute cmd(vkdev);
// upload
ncnn::VkMat in0_gpu;
ncnn::VkMat in1_gpu;
{
cmd.record_clone(in0, in0_gpu, opt);
cmd.record_clone(in1, in1_gpu, opt);
}
ncnn::VkMat out_gpu;
if (tta_mode)
{
// preproc
ncnn::VkMat in0_gpu_padded[8];
ncnn::VkMat in1_gpu_padded[8];
ncnn::VkMat timestep_gpu_padded[2];
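        // spatial TTA runs the 8 symmetries of the frame: identity, x mirror,
        // 180-degree rotation and y mirror (views 0-3, w x h) plus their four
        // transposed counterparts (views 4-7, h x w); the timestep plane only
        // needs the two orientations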
{
in0_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded[0];
bindings[2] = in0_gpu_padded[1];
bindings[3] = in0_gpu_padded[2];
bindings[4] = in0_gpu_padded[3];
bindings[5] = in0_gpu_padded[4];
bindings[6] = in0_gpu_padded[5];
bindings[7] = in0_gpu_padded[6];
bindings[8] = in0_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded[0].w;
constants[4].i = in0_gpu_padded[0].h;
constants[5].i = in0_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded[0]);
}
{
in1_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded[0];
bindings[2] = in1_gpu_padded[1];
bindings[3] = in1_gpu_padded[2];
bindings[4] = in1_gpu_padded[3];
bindings[5] = in1_gpu_padded[4];
bindings[6] = in1_gpu_padded[5];
bindings[7] = in1_gpu_padded[6];
bindings[8] = in1_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded[0].w;
constants[4].i = in1_gpu_padded[0].h;
constants[5].i = in1_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded[0]);
}
{
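            // rife v4 conditions on an arbitrary t by feeding flownet a constant
            // plane filled with the timestep; one dispatch fills both the
            // upright and the transposed orientation (two bindings)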
timestep_gpu_padded[0].create(w_padded, h_padded, 1, in_out_tile_elemsize, 1, blob_vkallocator);
timestep_gpu_padded[1].create(h_padded, w_padded, 1, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = timestep_gpu_padded[0];
bindings[1] = timestep_gpu_padded[1];
std::vector<ncnn::vk_constant_type> constants(4);
constants[0].i = timestep_gpu_padded[0].w;
constants[1].i = timestep_gpu_padded[0].h;
constants[2].i = timestep_gpu_padded[0].cstep;
constants[3].f = timestep;
cmd.record_pipeline(rife_v4_timestep, bindings, constants, timestep_gpu_padded[0]);
}
ncnn::VkMat out_gpu_padded[8];
if (tta_temporal_mode)
{
ncnn::VkMat timestep_gpu_padded_reversed[2];
{
timestep_gpu_padded_reversed[0].create(w_padded, h_padded, 1, in_out_tile_elemsize, 1, blob_vkallocator);
timestep_gpu_padded_reversed[1].create(h_padded, w_padded, 1, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = timestep_gpu_padded_reversed[0];
bindings[1] = timestep_gpu_padded_reversed[1];
std::vector<ncnn::vk_constant_type> constants(4);
constants[0].i = timestep_gpu_padded_reversed[0].w;
constants[1].i = timestep_gpu_padded_reversed[0].h;
constants[2].i = timestep_gpu_padded_reversed[0].cstep;
constants[3].f = 1.f - timestep;
cmd.record_pipeline(rife_v4_timestep, bindings, constants, timestep_gpu_padded_reversed[0]);
}
ncnn::VkMat flow[4][8];
ncnn::VkMat flow_reversed[4][8];
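            // flow is refined coarse-to-fine over 4 passes: pass fi re-runs
            // flownet with the flows of all earlier passes fed back in (the
            // switches below fall through on purpose) and extracts blob flow<fi>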
for (int fi = 0; fi < 4; fi++)
{
for (int ti = 0; ti < 8; ti++)
{
{
// flownet flow mask
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded[ti]);
ex.input("in1", in1_gpu_padded[ti]);
ex.input("in2", timestep_gpu_padded[ti / 4]);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow[2][ti]);
case 2: ex.input("flow1", flow[1][ti]);
case 1: ex.input("flow0", flow[0][ti]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow[fi][ti], cmd);
}
}
}
{
// flownet flow mask reversed
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in1_gpu_padded[ti]);
ex.input("in1", in0_gpu_padded[ti]);
ex.input("in2", timestep_gpu_padded_reversed[ti / 4]);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow_reversed[2][ti]);
case 2: ex.input("flow1", flow_reversed[1][ti]);
case 1: ex.input("flow0", flow_reversed[0][ti]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow_reversed[fi][ti], cmd);
}
}
}
// merge flow and flow_reversed
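                    // the reversed pass sees frames and timestep swapped, so its
                    // flow pairs are swapped relative to the forward pass; the
                    // shader averages the two estimates in place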
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = flow[fi][ti];
bindings[1] = flow_reversed[fi][ti];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[fi][ti].w;
constants[1].i = flow[fi][ti].h;
constants[2].i = flow[fi][ti].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[fi][ti].w;
dispatcher.h = flow[fi][ti].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_temporal_avg, bindings, constants, dispatcher);
}
}
// avg flow mask
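                // map every augmented flow back to the canonical orientation
                // (swapping/negating components), average the eight estimates
                // and write the result back to each view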
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow[fi][0];
bindings[1] = flow[fi][1];
bindings[2] = flow[fi][2];
bindings[3] = flow[fi][3];
bindings[4] = flow[fi][4];
bindings[5] = flow[fi][5];
bindings[6] = flow[fi][6];
bindings[7] = flow[fi][7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[fi][0].w;
constants[1].i = flow[fi][0].h;
constants[2].i = flow[fi][0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[fi][0].w;
dispatcher.h = flow[fi][0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
}
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow_reversed[fi][0];
bindings[1] = flow_reversed[fi][1];
bindings[2] = flow_reversed[fi][2];
bindings[3] = flow_reversed[fi][3];
bindings[4] = flow_reversed[fi][4];
bindings[5] = flow_reversed[fi][5];
bindings[6] = flow_reversed[fi][6];
bindings[7] = flow_reversed[fi][7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow_reversed[fi][0].w;
constants[1].i = flow_reversed[fi][0].h;
constants[2].i = flow_reversed[fi][0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow_reversed[fi][0].w;
dispatcher.h = flow_reversed[fi][0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
}
}
ncnn::VkMat out_gpu_padded_reversed[8];
for (int ti = 0; ti < 8; ti++)
{
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded[ti]);
ex.input("in1", in1_gpu_padded[ti]);
ex.input("in2", timestep_gpu_padded[ti / 4]);
ex.input("flow0", flow[0][ti]);
ex.input("flow1", flow[1][ti]);
ex.input("flow2", flow[2][ti]);
ex.input("flow3", flow[3][ti]);
ex.extract("out0", out_gpu_padded[ti], cmd);
}
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in1_gpu_padded[ti]);
ex.input("in1", in0_gpu_padded[ti]);
ex.input("in2", timestep_gpu_padded_reversed[ti / 4]);
ex.input("flow0", flow_reversed[0][ti]);
ex.input("flow1", flow_reversed[1][ti]);
ex.input("flow2", flow_reversed[2][ti]);
ex.input("flow3", flow_reversed[3][ti]);
ex.extract("out0", out_gpu_padded_reversed[ti], cmd);
}
// merge output
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded[ti];
bindings[1] = out_gpu_padded_reversed[ti];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = out_gpu_padded[ti].w;
constants[1].i = out_gpu_padded[ti].h;
constants[2].i = out_gpu_padded[ti].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = out_gpu_padded[ti].w;
dispatcher.h = out_gpu_padded[ti].h;
dispatcher.c = 3;
cmd.record_pipeline(rife_out_tta_temporal_avg, bindings, constants, dispatcher);
}
}
}
else
{
ncnn::VkMat flow[4][8];
for (int fi = 0; fi < 4; fi++)
{
for (int ti = 0; ti < 8; ti++)
{
// flownet flow mask
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded[ti]);
ex.input("in1", in1_gpu_padded[ti]);
ex.input("in2", timestep_gpu_padded[ti / 4]);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow[2][ti]);
case 2: ex.input("flow1", flow[1][ti]);
case 1: ex.input("flow0", flow[0][ti]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow[fi][ti], cmd);
}
}
}
// avg flow mask
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow[fi][0];
bindings[1] = flow[fi][1];
bindings[2] = flow[fi][2];
bindings[3] = flow[fi][3];
bindings[4] = flow[fi][4];
bindings[5] = flow[fi][5];
bindings[6] = flow[fi][6];
bindings[7] = flow[fi][7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[fi][0].w;
constants[1].i = flow[fi][0].h;
constants[2].i = flow[fi][0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[fi][0].w;
dispatcher.h = flow[fi][0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
}
}
for (int ti = 0; ti < 8; ti++)
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded[ti]);
ex.input("in1", in1_gpu_padded[ti]);
ex.input("in2", timestep_gpu_padded[ti / 4]);
ex.input("flow0", flow[0][ti]);
ex.input("flow1", flow[1][ti]);
ex.input("flow2", flow[2][ti]);
ex.input("flow3", flow[3][ti]);
ex.extract("out0", out_gpu_padded[ti], cmd);
}
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
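        // the postproc shader crops the padding, undoes each augmentation,
        // averages the eight outputs and converts back to 0-255 pixels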
{
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = out_gpu_padded[0];
bindings[1] = out_gpu_padded[1];
bindings[2] = out_gpu_padded[2];
bindings[3] = out_gpu_padded[3];
bindings[4] = out_gpu_padded[4];
bindings[5] = out_gpu_padded[5];
bindings[6] = out_gpu_padded[6];
bindings[7] = out_gpu_padded[7];
bindings[8] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded[0].w;
constants[1].i = out_gpu_padded[0].h;
constants[2].i = out_gpu_padded[0].cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
else
{
// preproc
ncnn::VkMat in0_gpu_padded;
ncnn::VkMat in1_gpu_padded;
ncnn::VkMat timestep_gpu_padded;
{
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded.w;
constants[4].i = in0_gpu_padded.h;
constants[5].i = in0_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
}
{
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded.w;
constants[4].i = in1_gpu_padded.h;
constants[5].i = in1_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
}
{
timestep_gpu_padded.create(w_padded, h_padded, 1, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(1);
bindings[0] = timestep_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(4);
constants[0].i = timestep_gpu_padded.w;
constants[1].i = timestep_gpu_padded.h;
constants[2].i = timestep_gpu_padded.cstep;
constants[3].f = timestep;
cmd.record_pipeline(rife_v4_timestep, bindings, constants, timestep_gpu_padded);
}
ncnn::VkMat out_gpu_padded;
if (tta_temporal_mode)
{
ncnn::VkMat timestep_gpu_padded_reversed;
{
timestep_gpu_padded_reversed.create(w_padded, h_padded, 1, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(1);
bindings[0] = timestep_gpu_padded_reversed;
std::vector<ncnn::vk_constant_type> constants(4);
constants[0].i = timestep_gpu_padded_reversed.w;
constants[1].i = timestep_gpu_padded_reversed.h;
constants[2].i = timestep_gpu_padded_reversed.cstep;
constants[3].f = 1.f - timestep;
cmd.record_pipeline(rife_v4_timestep, bindings, constants, timestep_gpu_padded_reversed);
}
ncnn::VkMat flow[4];
ncnn::VkMat flow_reversed[4];
for (int fi = 0; fi < 4; fi++)
{
{
// flownet flow mask
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded);
ex.input("in1", in1_gpu_padded);
ex.input("in2", timestep_gpu_padded);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow[2]);
case 2: ex.input("flow1", flow[1]);
case 1: ex.input("flow0", flow[0]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow[fi], cmd);
}
}
}
{
// flownet flow mask reversed
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in1_gpu_padded);
ex.input("in1", in0_gpu_padded);
ex.input("in2", timestep_gpu_padded_reversed);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow_reversed[2]);
case 2: ex.input("flow1", flow_reversed[1]);
case 1: ex.input("flow0", flow_reversed[0]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow_reversed[fi], cmd);
}
}
}
// merge flow and flow_reversed
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = flow[fi];
bindings[1] = flow_reversed[fi];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[fi].w;
constants[1].i = flow[fi].h;
constants[2].i = flow[fi].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[fi].w;
dispatcher.h = flow[fi].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_temporal_avg, bindings, constants, dispatcher);
}
}
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded);
ex.input("in1", in1_gpu_padded);
ex.input("in2", timestep_gpu_padded);
ex.input("flow0", flow[0]);
ex.input("flow1", flow[1]);
ex.input("flow2", flow[2]);
ex.input("flow3", flow[3]);
ex.extract("out0", out_gpu_padded, cmd);
}
ncnn::VkMat out_gpu_padded_reversed;
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in1_gpu_padded);
ex.input("in1", in0_gpu_padded);
ex.input("in2", timestep_gpu_padded_reversed);
ex.input("flow0", flow_reversed[0]);
ex.input("flow1", flow_reversed[1]);
ex.input("flow2", flow_reversed[2]);
ex.input("flow3", flow_reversed[3]);
ex.extract("out0", out_gpu_padded_reversed, cmd);
}
// merge output
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu_padded_reversed;
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
ncnn::VkMat dispatcher;
dispatcher.w = out_gpu_padded.w;
dispatcher.h = out_gpu_padded.h;
dispatcher.c = 3;
cmd.record_pipeline(rife_out_tta_temporal_avg, bindings, constants, dispatcher);
}
}
else
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("in0", in0_gpu_padded);
ex.input("in1", in1_gpu_padded);
ex.input("in2", timestep_gpu_padded);
ex.extract("out0", out_gpu_padded, cmd);
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
// download
{
ncnn::Mat out;
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out = ncnn::Mat(out_gpu.w, out_gpu.h, (unsigned char*)outimage.data, (size_t)channels, 1);
}
cmd.record_clone(out_gpu, out, opt);
cmd.submit_and_wait();
if (!(opt.use_fp16_storage && opt.use_int8_storage))
{
#if _WIN32
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB2BGR);
#else
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB);
#endif
}
}
vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);
return 0;
}

int RIFE::process_v4_cpu(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float timestep, ncnn::Mat& outimage) const
{
if (timestep == 0.f)
{
outimage = in0image;
return 0;
}
if (timestep == 1.f)
{
outimage = in1image;
return 0;
}
const unsigned char* pixel0data = (const unsigned char*)in0image.data;
const unsigned char* pixel1data = (const unsigned char*)in1image.data;
const int w = in0image.w;
const int h = in0image.h;
    const int channels = 3; // in0image.elempack
// fprintf(stderr, "%d x %d\n", w, h);
ncnn::Option opt = flownet.opt;
// pad to 32n
int w_padded = (w + 31) / 32 * 32;
int h_padded = (h + 31) / 32 * 32;
ncnn::Mat in0;
ncnn::Mat in1;
{
#if _WIN32
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
#else
in0 = ncnn::Mat::from_pixels(pixel0data, ncnn::Mat::PIXEL_RGB, w, h);
in1 = ncnn::Mat::from_pixels(pixel1data, ncnn::Mat::PIXEL_RGB, w, h);
#endif
}
ncnn::Mat out;
if (tta_mode)
{
// preproc and border padding
ncnn::Mat in0_padded[8];
ncnn::Mat in1_padded[8];
ncnn::Mat timestep_padded[2];
{
in0_padded[0].create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in0_padded[0].channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in0.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
{
in1_padded[0].create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in1_padded[0].channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in1.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
{
timestep_padded[0].create(w_padded, h_padded, 1);
timestep_padded[1].create(h_padded, w_padded, 1);
timestep_padded[0].fill(timestep);
timestep_padded[1].fill(timestep);
}
// the other 7 directions
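        // view 1 mirrors x, view 2 rotates 180 degrees, view 3 mirrors y;
        // views 4-7 are the transposed counterparts (note the swapped w/h)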
{
in0_padded[1].create(w_padded, h_padded, 3);
in0_padded[2].create(w_padded, h_padded, 3);
in0_padded[3].create(w_padded, h_padded, 3);
in0_padded[4].create(h_padded, w_padded, 3);
in0_padded[5].create(h_padded, w_padded, 3);
in0_padded[6].create(h_padded, w_padded, 3);
in0_padded[7].create(h_padded, w_padded, 3);
for (int q = 0; q < 3; q++)
{
const ncnn::Mat in0_padded_0 = in0_padded[0].channel(q);
ncnn::Mat in0_padded_1 = in0_padded[1].channel(q);
ncnn::Mat in0_padded_2 = in0_padded[2].channel(q);
ncnn::Mat in0_padded_3 = in0_padded[3].channel(q);
ncnn::Mat in0_padded_4 = in0_padded[4].channel(q);
ncnn::Mat in0_padded_5 = in0_padded[5].channel(q);
ncnn::Mat in0_padded_6 = in0_padded[6].channel(q);
ncnn::Mat in0_padded_7 = in0_padded[7].channel(q);
for (int i = 0; i < h_padded; i++)
{
const float* outptr0 = in0_padded_0.row(i);
float* outptr1 = in0_padded_1.row(i) + w_padded - 1;
float* outptr2 = in0_padded_2.row(h_padded - 1 - i) + w_padded - 1;
float* outptr3 = in0_padded_3.row(h_padded - 1 - i);
for (int j = 0; j < w_padded; j++)
{
float* outptr4 = in0_padded_4.row(j) + i;
float* outptr5 = in0_padded_5.row(j) + h_padded - 1 - i;
float* outptr6 = in0_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
float* outptr7 = in0_padded_7.row(w_padded - 1 - j) + i;
float v = *outptr0++;
*outptr1-- = v;
*outptr2-- = v;
*outptr3++ = v;
*outptr4 = v;
*outptr5 = v;
*outptr6 = v;
*outptr7 = v;
}
}
}
}
{
in1_padded[1].create(w_padded, h_padded, 3);
in1_padded[2].create(w_padded, h_padded, 3);
in1_padded[3].create(w_padded, h_padded, 3);
in1_padded[4].create(h_padded, w_padded, 3);
in1_padded[5].create(h_padded, w_padded, 3);
in1_padded[6].create(h_padded, w_padded, 3);
in1_padded[7].create(h_padded, w_padded, 3);
for (int q = 0; q < 3; q++)
{
const ncnn::Mat in1_padded_0 = in1_padded[0].channel(q);
ncnn::Mat in1_padded_1 = in1_padded[1].channel(q);
ncnn::Mat in1_padded_2 = in1_padded[2].channel(q);
ncnn::Mat in1_padded_3 = in1_padded[3].channel(q);
ncnn::Mat in1_padded_4 = in1_padded[4].channel(q);
ncnn::Mat in1_padded_5 = in1_padded[5].channel(q);
ncnn::Mat in1_padded_6 = in1_padded[6].channel(q);
ncnn::Mat in1_padded_7 = in1_padded[7].channel(q);
for (int i = 0; i < h_padded; i++)
{
const float* outptr0 = in1_padded_0.row(i);
float* outptr1 = in1_padded_1.row(i) + w_padded - 1;
float* outptr2 = in1_padded_2.row(h_padded - 1 - i) + w_padded - 1;
float* outptr3 = in1_padded_3.row(h_padded - 1 - i);
for (int j = 0; j < w_padded; j++)
{
float* outptr4 = in1_padded_4.row(j) + i;
float* outptr5 = in1_padded_5.row(j) + h_padded - 1 - i;
float* outptr6 = in1_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
float* outptr7 = in1_padded_7.row(w_padded - 1 - j) + i;
float v = *outptr0++;
*outptr1-- = v;
*outptr2-- = v;
*outptr3++ = v;
*outptr4 = v;
*outptr5 = v;
*outptr6 = v;
*outptr7 = v;
}
}
}
}
ncnn::Mat out_padded[8];
ncnn::Mat out_padded_reversed[8];
if (tta_temporal_mode)
{
ncnn::Mat timestep_padded_reversed[2];
timestep_padded_reversed[0].create(w_padded, h_padded, 1);
timestep_padded_reversed[1].create(h_padded, w_padded, 1);
timestep_padded_reversed[0].fill(1.f - timestep);
timestep_padded_reversed[1].fill(1.f - timestep);
ncnn::Mat flow[4][8];
ncnn::Mat flow_reversed[4][8];
for (int fi = 0; fi < 4; fi++)
{
for (int ti = 0; ti < 8; ti++)
{
// flownet flow mask
{
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded[ti]);
ex.input("in1", in1_padded[ti]);
ex.input("in2", timestep_padded[ti / 4]);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow[2][ti]);
case 2: ex.input("flow1", flow[1][ti]);
case 1: ex.input("flow0", flow[0][ti]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow[fi][ti]);
}
}
}
// flownet flow mask reversed
{
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in1_padded[ti]);
ex.input("in1", in0_padded[ti]);
ex.input("in2", timestep_padded_reversed[ti / 4]);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow_reversed[2][ti]);
case 2: ex.input("flow1", flow_reversed[1][ti]);
case 1: ex.input("flow0", flow_reversed[0][ti]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow_reversed[fi][ti]);
}
}
}
// merge flow and flow_reversed
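                    // the v4 flow blob carries two flow pairs (xy toward frame0,
                    // zw toward frame1) plus a blending mask in channel 4;
                    // reversing time swaps the pairs and flips the mask's sign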
{
float* flow_x = flow[fi][ti].channel(0);
float* flow_y = flow[fi][ti].channel(1);
float* flow_z = flow[fi][ti].channel(2);
float* flow_w = flow[fi][ti].channel(3);
float* flow_m = flow[fi][ti].channel(4);
float* flow_reversed_x = flow_reversed[fi][ti].channel(0);
float* flow_reversed_y = flow_reversed[fi][ti].channel(1);
float* flow_reversed_z = flow_reversed[fi][ti].channel(2);
float* flow_reversed_w = flow_reversed[fi][ti].channel(3);
float* flow_reversed_m = flow_reversed[fi][ti].channel(4);
for (int i = 0; i < flow[fi][ti].h; i++)
{
for (int j = 0; j < flow[fi][ti].w; j++)
{
float x = (*flow_x + *flow_reversed_z) * 0.5f;
float y = (*flow_y + *flow_reversed_w) * 0.5f;
float z = (*flow_z + *flow_reversed_x) * 0.5f;
float w = (*flow_w + *flow_reversed_y) * 0.5f;
float m = (*flow_m - *flow_reversed_m) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_z++ = z;
*flow_w++ = w;
*flow_m++ = m;
*flow_reversed_x++ = z;
*flow_reversed_y++ = w;
*flow_reversed_z++ = x;
*flow_reversed_w++ = y;
*flow_reversed_m++ = -m;
}
}
}
}
// avg flow mask
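                // undo each augmentation before averaging: x components flip
                // sign under x mirroring, y components under y mirroring, the
                // transposed views swap x/y (and z/w); the mask is invariant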
{
ncnn::Mat flow_x0 = flow[fi][0].channel(0);
ncnn::Mat flow_x1 = flow[fi][1].channel(0);
ncnn::Mat flow_x2 = flow[fi][2].channel(0);
ncnn::Mat flow_x3 = flow[fi][3].channel(0);
ncnn::Mat flow_x4 = flow[fi][4].channel(0);
ncnn::Mat flow_x5 = flow[fi][5].channel(0);
ncnn::Mat flow_x6 = flow[fi][6].channel(0);
ncnn::Mat flow_x7 = flow[fi][7].channel(0);
ncnn::Mat flow_y0 = flow[fi][0].channel(1);
ncnn::Mat flow_y1 = flow[fi][1].channel(1);
ncnn::Mat flow_y2 = flow[fi][2].channel(1);
ncnn::Mat flow_y3 = flow[fi][3].channel(1);
ncnn::Mat flow_y4 = flow[fi][4].channel(1);
ncnn::Mat flow_y5 = flow[fi][5].channel(1);
ncnn::Mat flow_y6 = flow[fi][6].channel(1);
ncnn::Mat flow_y7 = flow[fi][7].channel(1);
ncnn::Mat flow_z0 = flow[fi][0].channel(2);
ncnn::Mat flow_z1 = flow[fi][1].channel(2);
ncnn::Mat flow_z2 = flow[fi][2].channel(2);
ncnn::Mat flow_z3 = flow[fi][3].channel(2);
ncnn::Mat flow_z4 = flow[fi][4].channel(2);
ncnn::Mat flow_z5 = flow[fi][5].channel(2);
ncnn::Mat flow_z6 = flow[fi][6].channel(2);
ncnn::Mat flow_z7 = flow[fi][7].channel(2);
ncnn::Mat flow_w0 = flow[fi][0].channel(3);
ncnn::Mat flow_w1 = flow[fi][1].channel(3);
ncnn::Mat flow_w2 = flow[fi][2].channel(3);
ncnn::Mat flow_w3 = flow[fi][3].channel(3);
ncnn::Mat flow_w4 = flow[fi][4].channel(3);
ncnn::Mat flow_w5 = flow[fi][5].channel(3);
ncnn::Mat flow_w6 = flow[fi][6].channel(3);
ncnn::Mat flow_w7 = flow[fi][7].channel(3);
ncnn::Mat flow_m0 = flow[fi][0].channel(4);
ncnn::Mat flow_m1 = flow[fi][1].channel(4);
ncnn::Mat flow_m2 = flow[fi][2].channel(4);
ncnn::Mat flow_m3 = flow[fi][3].channel(4);
ncnn::Mat flow_m4 = flow[fi][4].channel(4);
ncnn::Mat flow_m5 = flow[fi][5].channel(4);
ncnn::Mat flow_m6 = flow[fi][6].channel(4);
ncnn::Mat flow_m7 = flow[fi][7].channel(4);
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
float* z0 = flow_z0.row(i);
float* z1 = flow_z1.row(i) + flow_x0.w - 1;
float* z2 = flow_z2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* z3 = flow_z3.row(flow_x0.h - 1 - i);
float* w0 = flow_w0.row(i);
float* w1 = flow_w1.row(i) + flow_x0.w - 1;
float* w2 = flow_w2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* w3 = flow_w3.row(flow_x0.h - 1 - i);
float* m0 = flow_m0.row(i);
float* m1 = flow_m1.row(i) + flow_x0.w - 1;
float* m2 = flow_m2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* m3 = flow_m3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float* z4 = flow_z4.row(j) + i;
float* z5 = flow_z5.row(j) + flow_x0.h - 1 - i;
float* z6 = flow_z6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* z7 = flow_z7.row(flow_x0.w - 1 - j) + i;
float* w4 = flow_w4.row(j) + i;
float* w5 = flow_w5.row(j) + flow_x0.h - 1 - i;
float* w6 = flow_w6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* w7 = flow_w7.row(flow_x0.w - 1 - j) + i;
float* m4 = flow_m4.row(j) + i;
float* m5 = flow_m5.row(j) + flow_x0.h - 1 - i;
float* m6 = flow_m6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* m7 = flow_m7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
float z = (*z0 + -*z1 + -*z2 + *z3 + *w4 + *w5 + -*w6 + -*w7) * 0.125f;
float w = (*w0 + *w1 + -*w2 + -*w3 + *z4 + -*z5 + -*z6 + *z7) * 0.125f;
float m = (*m0 + *m1 + *m2 + *m3 + *m4 + *m5 + *m6 + *m7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
*z0++ = z;
*z1-- = -z;
*z2-- = -z;
*z3++ = z;
*z4 = w;
*z5 = -w;
*z6 = -w;
*z7 = w;
*w0++ = w;
*w1-- = w;
*w2-- = -w;
*w3++ = -w;
*w4 = z;
*w5 = z;
*w6 = -z;
*w7 = -z;
*m0++ = m;
*m1-- = m;
*m2-- = m;
*m3++ = m;
*m4 = m;
*m5 = m;
*m6 = m;
*m7 = m;
}
}
}
{
ncnn::Mat flow_x0 = flow_reversed[fi][0].channel(0);
ncnn::Mat flow_x1 = flow_reversed[fi][1].channel(0);
ncnn::Mat flow_x2 = flow_reversed[fi][2].channel(0);
ncnn::Mat flow_x3 = flow_reversed[fi][3].channel(0);
ncnn::Mat flow_x4 = flow_reversed[fi][4].channel(0);
ncnn::Mat flow_x5 = flow_reversed[fi][5].channel(0);
ncnn::Mat flow_x6 = flow_reversed[fi][6].channel(0);
ncnn::Mat flow_x7 = flow_reversed[fi][7].channel(0);
ncnn::Mat flow_y0 = flow_reversed[fi][0].channel(1);
ncnn::Mat flow_y1 = flow_reversed[fi][1].channel(1);
ncnn::Mat flow_y2 = flow_reversed[fi][2].channel(1);
ncnn::Mat flow_y3 = flow_reversed[fi][3].channel(1);
ncnn::Mat flow_y4 = flow_reversed[fi][4].channel(1);
ncnn::Mat flow_y5 = flow_reversed[fi][5].channel(1);
ncnn::Mat flow_y6 = flow_reversed[fi][6].channel(1);
ncnn::Mat flow_y7 = flow_reversed[fi][7].channel(1);
ncnn::Mat flow_z0 = flow_reversed[fi][0].channel(2);
ncnn::Mat flow_z1 = flow_reversed[fi][1].channel(2);
ncnn::Mat flow_z2 = flow_reversed[fi][2].channel(2);
ncnn::Mat flow_z3 = flow_reversed[fi][3].channel(2);
ncnn::Mat flow_z4 = flow_reversed[fi][4].channel(2);
ncnn::Mat flow_z5 = flow_reversed[fi][5].channel(2);
ncnn::Mat flow_z6 = flow_reversed[fi][6].channel(2);
ncnn::Mat flow_z7 = flow_reversed[fi][7].channel(2);
ncnn::Mat flow_w0 = flow_reversed[fi][0].channel(3);
ncnn::Mat flow_w1 = flow_reversed[fi][1].channel(3);
ncnn::Mat flow_w2 = flow_reversed[fi][2].channel(3);
ncnn::Mat flow_w3 = flow_reversed[fi][3].channel(3);
ncnn::Mat flow_w4 = flow_reversed[fi][4].channel(3);
ncnn::Mat flow_w5 = flow_reversed[fi][5].channel(3);
ncnn::Mat flow_w6 = flow_reversed[fi][6].channel(3);
ncnn::Mat flow_w7 = flow_reversed[fi][7].channel(3);
ncnn::Mat flow_m0 = flow_reversed[fi][0].channel(4);
ncnn::Mat flow_m1 = flow_reversed[fi][1].channel(4);
ncnn::Mat flow_m2 = flow_reversed[fi][2].channel(4);
ncnn::Mat flow_m3 = flow_reversed[fi][3].channel(4);
ncnn::Mat flow_m4 = flow_reversed[fi][4].channel(4);
ncnn::Mat flow_m5 = flow_reversed[fi][5].channel(4);
ncnn::Mat flow_m6 = flow_reversed[fi][6].channel(4);
ncnn::Mat flow_m7 = flow_reversed[fi][7].channel(4);
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
float* z0 = flow_z0.row(i);
float* z1 = flow_z1.row(i) + flow_x0.w - 1;
float* z2 = flow_z2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* z3 = flow_z3.row(flow_x0.h - 1 - i);
float* w0 = flow_w0.row(i);
float* w1 = flow_w1.row(i) + flow_x0.w - 1;
float* w2 = flow_w2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* w3 = flow_w3.row(flow_x0.h - 1 - i);
float* m0 = flow_m0.row(i);
float* m1 = flow_m1.row(i) + flow_x0.w - 1;
float* m2 = flow_m2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* m3 = flow_m3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float* z4 = flow_z4.row(j) + i;
float* z5 = flow_z5.row(j) + flow_x0.h - 1 - i;
float* z6 = flow_z6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* z7 = flow_z7.row(flow_x0.w - 1 - j) + i;
float* w4 = flow_w4.row(j) + i;
float* w5 = flow_w5.row(j) + flow_x0.h - 1 - i;
float* w6 = flow_w6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* w7 = flow_w7.row(flow_x0.w - 1 - j) + i;
float* m4 = flow_m4.row(j) + i;
float* m5 = flow_m5.row(j) + flow_x0.h - 1 - i;
float* m6 = flow_m6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* m7 = flow_m7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
float z = (*z0 + -*z1 + -*z2 + *z3 + *w4 + *w5 + -*w6 + -*w7) * 0.125f;
float w = (*w0 + *w1 + -*w2 + -*w3 + *z4 + -*z5 + -*z6 + *z7) * 0.125f;
float m = (*m0 + *m1 + *m2 + *m3 + *m4 + *m5 + *m6 + *m7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
*z0++ = z;
*z1-- = -z;
*z2-- = -z;
*z3++ = z;
*z4 = w;
*z5 = -w;
*z6 = -w;
*z7 = w;
*w0++ = w;
*w1-- = w;
*w2-- = -w;
*w3++ = -w;
*w4 = z;
*w5 = z;
*w6 = -z;
*w7 = -z;
*m0++ = m;
*m1-- = m;
*m2-- = m;
*m3++ = m;
*m4 = m;
*m5 = m;
*m6 = m;
*m7 = m;
}
}
}
}
for (int ti = 0; ti < 8; ti++)
{
// flownet
{
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded[ti]);
ex.input("in1", in1_padded[ti]);
ex.input("in2", timestep_padded[ti / 4]);
ex.input("flow0", flow[0][ti]);
ex.input("flow1", flow[1][ti]);
ex.input("flow2", flow[2][ti]);
ex.input("flow3", flow[3][ti]);
ex.extract("out0", out_padded[ti]);
}
{
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in1_padded[ti]);
ex.input("in1", in0_padded[ti]);
ex.input("in2", timestep_padded_reversed[ti / 4]);
ex.input("flow0", flow_reversed[0][ti]);
ex.input("flow1", flow_reversed[1][ti]);
ex.input("flow2", flow_reversed[2][ti]);
ex.input("flow3", flow_reversed[3][ti]);
ex.extract("out0", out_padded_reversed[ti]);
}
}
}
else
{
ncnn::Mat flow[4][8];
for (int fi = 0; fi < 4; fi++)
{
for (int ti = 0; ti < 8; ti++)
{
// flownet flow mask
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded[ti]);
ex.input("in1", in1_padded[ti]);
ex.input("in2", timestep_padded[ti / 4]);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow[2][ti]);
case 2: ex.input("flow1", flow[1][ti]);
case 1: ex.input("flow0", flow[0][ti]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow[fi][ti]);
}
}
}
// avg flow mask
{
ncnn::Mat flow_x0 = flow[fi][0].channel(0);
ncnn::Mat flow_x1 = flow[fi][1].channel(0);
ncnn::Mat flow_x2 = flow[fi][2].channel(0);
ncnn::Mat flow_x3 = flow[fi][3].channel(0);
ncnn::Mat flow_x4 = flow[fi][4].channel(0);
ncnn::Mat flow_x5 = flow[fi][5].channel(0);
ncnn::Mat flow_x6 = flow[fi][6].channel(0);
ncnn::Mat flow_x7 = flow[fi][7].channel(0);
ncnn::Mat flow_y0 = flow[fi][0].channel(1);
ncnn::Mat flow_y1 = flow[fi][1].channel(1);
ncnn::Mat flow_y2 = flow[fi][2].channel(1);
ncnn::Mat flow_y3 = flow[fi][3].channel(1);
ncnn::Mat flow_y4 = flow[fi][4].channel(1);
ncnn::Mat flow_y5 = flow[fi][5].channel(1);
ncnn::Mat flow_y6 = flow[fi][6].channel(1);
ncnn::Mat flow_y7 = flow[fi][7].channel(1);
ncnn::Mat flow_z0 = flow[fi][0].channel(2);
ncnn::Mat flow_z1 = flow[fi][1].channel(2);
ncnn::Mat flow_z2 = flow[fi][2].channel(2);
ncnn::Mat flow_z3 = flow[fi][3].channel(2);
ncnn::Mat flow_z4 = flow[fi][4].channel(2);
ncnn::Mat flow_z5 = flow[fi][5].channel(2);
ncnn::Mat flow_z6 = flow[fi][6].channel(2);
ncnn::Mat flow_z7 = flow[fi][7].channel(2);
ncnn::Mat flow_w0 = flow[fi][0].channel(3);
ncnn::Mat flow_w1 = flow[fi][1].channel(3);
ncnn::Mat flow_w2 = flow[fi][2].channel(3);
ncnn::Mat flow_w3 = flow[fi][3].channel(3);
ncnn::Mat flow_w4 = flow[fi][4].channel(3);
ncnn::Mat flow_w5 = flow[fi][5].channel(3);
ncnn::Mat flow_w6 = flow[fi][6].channel(3);
ncnn::Mat flow_w7 = flow[fi][7].channel(3);
ncnn::Mat flow_m0 = flow[fi][0].channel(4);
ncnn::Mat flow_m1 = flow[fi][1].channel(4);
ncnn::Mat flow_m2 = flow[fi][2].channel(4);
ncnn::Mat flow_m3 = flow[fi][3].channel(4);
ncnn::Mat flow_m4 = flow[fi][4].channel(4);
ncnn::Mat flow_m5 = flow[fi][5].channel(4);
ncnn::Mat flow_m6 = flow[fi][6].channel(4);
ncnn::Mat flow_m7 = flow[fi][7].channel(4);
for (int i = 0; i < flow_x0.h; i++)
{
float* x0 = flow_x0.row(i);
float* x1 = flow_x1.row(i) + flow_x0.w - 1;
float* x2 = flow_x2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* x3 = flow_x3.row(flow_x0.h - 1 - i);
float* y0 = flow_y0.row(i);
float* y1 = flow_y1.row(i) + flow_x0.w - 1;
float* y2 = flow_y2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* y3 = flow_y3.row(flow_x0.h - 1 - i);
float* z0 = flow_z0.row(i);
float* z1 = flow_z1.row(i) + flow_x0.w - 1;
float* z2 = flow_z2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* z3 = flow_z3.row(flow_x0.h - 1 - i);
float* w0 = flow_w0.row(i);
float* w1 = flow_w1.row(i) + flow_x0.w - 1;
float* w2 = flow_w2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* w3 = flow_w3.row(flow_x0.h - 1 - i);
float* m0 = flow_m0.row(i);
float* m1 = flow_m1.row(i) + flow_x0.w - 1;
float* m2 = flow_m2.row(flow_x0.h - 1 - i) + flow_x0.w - 1;
float* m3 = flow_m3.row(flow_x0.h - 1 - i);
for (int j = 0; j < flow_x0.w; j++)
{
float* x4 = flow_x4.row(j) + i;
float* x5 = flow_x5.row(j) + flow_x0.h - 1 - i;
float* x6 = flow_x6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* x7 = flow_x7.row(flow_x0.w - 1 - j) + i;
float* y4 = flow_y4.row(j) + i;
float* y5 = flow_y5.row(j) + flow_x0.h - 1 - i;
float* y6 = flow_y6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* y7 = flow_y7.row(flow_x0.w - 1 - j) + i;
float* z4 = flow_z4.row(j) + i;
float* z5 = flow_z5.row(j) + flow_x0.h - 1 - i;
float* z6 = flow_z6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* z7 = flow_z7.row(flow_x0.w - 1 - j) + i;
float* w4 = flow_w4.row(j) + i;
float* w5 = flow_w5.row(j) + flow_x0.h - 1 - i;
float* w6 = flow_w6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* w7 = flow_w7.row(flow_x0.w - 1 - j) + i;
float* m4 = flow_m4.row(j) + i;
float* m5 = flow_m5.row(j) + flow_x0.h - 1 - i;
float* m6 = flow_m6.row(flow_x0.w - 1 - j) + flow_x0.h - 1 - i;
float* m7 = flow_m7.row(flow_x0.w - 1 - j) + i;
float x = (*x0 + -*x1 + -*x2 + *x3 + *y4 + *y5 + -*y6 + -*y7) * 0.125f;
float y = (*y0 + *y1 + -*y2 + -*y3 + *x4 + -*x5 + -*x6 + *x7) * 0.125f;
float z = (*z0 + -*z1 + -*z2 + *z3 + *w4 + *w5 + -*w6 + -*w7) * 0.125f;
float w = (*w0 + *w1 + -*w2 + -*w3 + *z4 + -*z5 + -*z6 + *z7) * 0.125f;
float m = (*m0 + *m1 + *m2 + *m3 + *m4 + *m5 + *m6 + *m7) * 0.125f;
*x0++ = x;
*x1-- = -x;
*x2-- = -x;
*x3++ = x;
*x4 = y;
*x5 = -y;
*x6 = -y;
*x7 = y;
*y0++ = y;
*y1-- = y;
*y2-- = -y;
*y3++ = -y;
*y4 = x;
*y5 = x;
*y6 = -x;
*y7 = -x;
*z0++ = z;
*z1-- = -z;
*z2-- = -z;
*z3++ = z;
*z4 = w;
*z5 = -w;
*z6 = -w;
*z7 = w;
*w0++ = w;
*w1-- = w;
*w2-- = -w;
*w3++ = -w;
*w4 = z;
*w5 = z;
*w6 = -z;
*w7 = -z;
*m0++ = m;
*m1-- = m;
*m2-- = m;
*m3++ = m;
*m4 = m;
*m5 = m;
*m6 = m;
*m7 = m;
}
}
}
}
for (int ti = 0; ti < 8; ti++)
{
// flownet
{
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded[ti]);
ex.input("in1", in1_padded[ti]);
ex.input("in2", timestep_padded[ti / 4]);
ex.input("flow0", flow[0][ti]);
ex.input("flow1", flow[1][ti]);
ex.input("flow2", flow[2][ti]);
ex.input("flow3", flow[3][ti]);
ex.extract("out0", out_padded[ti]);
}
}
}
// cut padding and postproc
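        // read each of the eight outputs through the inverse of its
        // augmentation, average, then denormalize with round-to-nearest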
out.create(w, h, 3);
if (tta_temporal_mode)
{
for (int q = 0; q < 3; q++)
{
const ncnn::Mat out_padded_0 = out_padded[0].channel(q);
const ncnn::Mat out_padded_1 = out_padded[1].channel(q);
const ncnn::Mat out_padded_2 = out_padded[2].channel(q);
const ncnn::Mat out_padded_3 = out_padded[3].channel(q);
const ncnn::Mat out_padded_4 = out_padded[4].channel(q);
const ncnn::Mat out_padded_5 = out_padded[5].channel(q);
const ncnn::Mat out_padded_6 = out_padded[6].channel(q);
const ncnn::Mat out_padded_7 = out_padded[7].channel(q);
const ncnn::Mat out_padded_reversed_0 = out_padded_reversed[0].channel(q);
const ncnn::Mat out_padded_reversed_1 = out_padded_reversed[1].channel(q);
const ncnn::Mat out_padded_reversed_2 = out_padded_reversed[2].channel(q);
const ncnn::Mat out_padded_reversed_3 = out_padded_reversed[3].channel(q);
const ncnn::Mat out_padded_reversed_4 = out_padded_reversed[4].channel(q);
const ncnn::Mat out_padded_reversed_5 = out_padded_reversed[5].channel(q);
const ncnn::Mat out_padded_reversed_6 = out_padded_reversed[6].channel(q);
const ncnn::Mat out_padded_reversed_7 = out_padded_reversed[7].channel(q);
float* outptr = out.channel(q);
for (int i = 0; i < h; i++)
{
const float* ptr0 = out_padded_0.row(i);
const float* ptr1 = out_padded_1.row(i) + w_padded - 1;
const float* ptr2 = out_padded_2.row(h_padded - 1 - i) + w_padded - 1;
const float* ptr3 = out_padded_3.row(h_padded - 1 - i);
const float* ptrr0 = out_padded_reversed_0.row(i);
const float* ptrr1 = out_padded_reversed_1.row(i) + w_padded - 1;
const float* ptrr2 = out_padded_reversed_2.row(h_padded - 1 - i) + w_padded - 1;
const float* ptrr3 = out_padded_reversed_3.row(h_padded - 1 - i);
for (int j = 0; j < w; j++)
{
const float* ptr4 = out_padded_4.row(j) + i;
const float* ptr5 = out_padded_5.row(j) + h_padded - 1 - i;
const float* ptr6 = out_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
const float* ptr7 = out_padded_7.row(w_padded - 1 - j) + i;
const float* ptrr4 = out_padded_reversed_4.row(j) + i;
const float* ptrr5 = out_padded_reversed_5.row(j) + h_padded - 1 - i;
const float* ptrr6 = out_padded_reversed_6.row(w_padded - 1 - j) + h_padded - 1 - i;
const float* ptrr7 = out_padded_reversed_7.row(w_padded - 1 - j) + i;
float v = (*ptr0++ + *ptr1-- + *ptr2-- + *ptr3++ + *ptr4 + *ptr5 + *ptr6 + *ptr7) / 8;
float vr = (*ptrr0++ + *ptrr1-- + *ptrr2-- + *ptrr3++ + *ptrr4 + *ptrr5 + *ptrr6 + *ptrr7) / 8;
*outptr++ = (v + vr) * 0.5f * 255.f + 0.5f;
}
}
}
}
else
{
for (int q = 0; q < 3; q++)
{
const ncnn::Mat out_padded_0 = out_padded[0].channel(q);
const ncnn::Mat out_padded_1 = out_padded[1].channel(q);
const ncnn::Mat out_padded_2 = out_padded[2].channel(q);
const ncnn::Mat out_padded_3 = out_padded[3].channel(q);
const ncnn::Mat out_padded_4 = out_padded[4].channel(q);
const ncnn::Mat out_padded_5 = out_padded[5].channel(q);
const ncnn::Mat out_padded_6 = out_padded[6].channel(q);
const ncnn::Mat out_padded_7 = out_padded[7].channel(q);
float* outptr = out.channel(q);
for (int i = 0; i < h; i++)
{
const float* ptr0 = out_padded_0.row(i);
const float* ptr1 = out_padded_1.row(i) + w_padded - 1;
const float* ptr2 = out_padded_2.row(h_padded - 1 - i) + w_padded - 1;
const float* ptr3 = out_padded_3.row(h_padded - 1 - i);
for (int j = 0; j < w; j++)
{
const float* ptr4 = out_padded_4.row(j) + i;
const float* ptr5 = out_padded_5.row(j) + h_padded - 1 - i;
const float* ptr6 = out_padded_6.row(w_padded - 1 - j) + h_padded - 1 - i;
const float* ptr7 = out_padded_7.row(w_padded - 1 - j) + i;
float v = (*ptr0++ + *ptr1-- + *ptr2-- + *ptr3++ + *ptr4 + *ptr5 + *ptr6 + *ptr7) / 8;
*outptr++ = v * 255.f + 0.5f;
}
}
}
}
}
else
{
// preproc and border padding
ncnn::Mat in0_padded;
ncnn::Mat in1_padded;
ncnn::Mat timestep_padded;
{
in0_padded.create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in0_padded.channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in0.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
{
in1_padded.create(w_padded, h_padded, 3);
for (int q = 0; q < 3; q++)
{
float* outptr = in1_padded.channel(q);
int i = 0;
for (; i < h; i++)
{
const float* ptr = in1.channel(q).row(i);
int j = 0;
for (; j < w; j++)
{
*outptr++ = *ptr++ * (1 / 255.f);
}
for (; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
for (; i < h_padded; i++)
{
for (int j = 0; j < w_padded; j++)
{
*outptr++ = 0.f;
}
}
}
}
{
timestep_padded.create(w_padded, h_padded, 1);
timestep_padded.fill(timestep);
}
ncnn::Mat out_padded;
ncnn::Mat out_padded_reversed;
if (tta_temporal_mode)
{
ncnn::Mat timestep_padded_reversed;
{
timestep_padded_reversed.create(w_padded, h_padded, 1);
timestep_padded_reversed.fill(1.f - timestep);
}
ncnn::Mat flow[4];
ncnn::Mat flow_reversed[4];
for (int fi = 0; fi < 4; fi++)
{
{
// flownet flow mask
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded);
ex.input("in1", in1_padded);
ex.input("in2", timestep_padded);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow[2]);
case 2: ex.input("flow1", flow[1]);
case 1: ex.input("flow0", flow[0]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow[fi]);
}
}
}
{
// flownet flow mask reversed
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in1_padded);
ex.input("in1", in0_padded);
ex.input("in2", timestep_padded_reversed);
// intentional fall through
switch (fi)
{
case 3: ex.input("flow2", flow_reversed[2]);
case 2: ex.input("flow1", flow_reversed[1]);
case 1: ex.input("flow0", flow_reversed[0]);
default:
{
char tmp[16];
sprintf(tmp, "flow%d", fi);
ex.extract(tmp, flow_reversed[fi]);
}
}
}
// merge flow and flow_reversed
{
float* flow_x = flow[fi].channel(0);
float* flow_y = flow[fi].channel(1);
float* flow_z = flow[fi].channel(2);
float* flow_w = flow[fi].channel(3);
float* flow_m = flow[fi].channel(4);
float* flow_reversed_x = flow_reversed[fi].channel(0);
float* flow_reversed_y = flow_reversed[fi].channel(1);
float* flow_reversed_z = flow_reversed[fi].channel(2);
float* flow_reversed_w = flow_reversed[fi].channel(3);
float* flow_reversed_m = flow_reversed[fi].channel(4);
for (int i = 0; i < flow[fi].h; i++)
{
for (int j = 0; j < flow[fi].w; j++)
{
float x = (*flow_x + *flow_reversed_z) * 0.5f;
float y = (*flow_y + *flow_reversed_w) * 0.5f;
float z = (*flow_z + *flow_reversed_x) * 0.5f;
float w = (*flow_w + *flow_reversed_y) * 0.5f;
float m = (*flow_m - *flow_reversed_m) * 0.5f;
*flow_x++ = x;
*flow_y++ = y;
*flow_z++ = z;
*flow_w++ = w;
*flow_m++ = m;
*flow_reversed_x++ = z;
*flow_reversed_y++ = w;
*flow_reversed_z++ = x;
*flow_reversed_w++ = y;
*flow_reversed_m++ = -m;
}
}
}
}
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded);
ex.input("in1", in1_padded);
ex.input("in2", timestep_padded);
ex.input("flow0", flow[0]);
ex.input("flow1", flow[1]);
ex.input("flow2", flow[2]);
ex.input("flow3", flow[3]);
ex.extract("out0", out_padded);
}
{
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in1_padded);
ex.input("in1", in0_padded);
ex.input("in2", timestep_padded_reversed);
ex.input("flow0", flow_reversed[0]);
ex.input("flow1", flow_reversed[1]);
ex.input("flow2", flow_reversed[2]);
ex.input("flow3", flow_reversed[3]);
ex.extract("out0", out_padded_reversed);
}
}
else
{
// flownet
ncnn::Extractor ex = flownet.create_extractor();
ex.input("in0", in0_padded);
ex.input("in1", in1_padded);
ex.input("in2", timestep_padded);
ex.extract("out0", out_padded);
}
// cut padding and postproc
out.create(w, h, 3);
if (tta_temporal_mode)
{
for (int q = 0; q < 3; q++)
{
float* outptr = out.channel(q);
const float* ptr = out_padded.channel(q);
const float* ptr1 = out_padded_reversed.channel(q);
for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)
{
*outptr++ = (*ptr++ + *ptr1++) * 0.5f * 255.f + 0.5f;
}
}
}
}
else
{
for (int q = 0; q < 3; q++)
{
float* outptr = out.channel(q);
const float* ptr = out_padded.channel(q);
for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)
{
*outptr++ = *ptr++ * 255.f + 0.5f;
}
}
}
}
}
// download
{
#if _WIN32
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB2BGR);
#else
out.to_pixels((unsigned char*)outimage.data, ncnn::Mat::PIXEL_RGB);
#endif
}
return 0;
}