update ncnn, spatial tta option, enable lto

nihui 2021-01-03 11:20:56 +08:00
parent 5c362017e5
commit 8d0af9f79b
11 changed files with 668 additions and 148 deletions
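
For context on the new -x option: spatial test-time augmentation (TTA-s) runs the network on the 8 dihedral transforms of the input pair (identity, horizontal and vertical flips, 180-degree rotation, and the four transposed variants) and averages the results after mapping them back to the canonical orientation, trading roughly 8x compute for a somewhat more stable result. The sketch below is a minimal CPU illustration of that averaging on a single-channel image; infer() is a hypothetical stand-in for the RIFE networks, not the project's API, and the commit itself does all of this on the GPU through the new *_tta.comp shaders, additionally averaging the optical flow across variants.

```cpp
#include <cstdio>
#include <vector>

struct Image { int w, h; std::vector<float> d; };

// linear index of canonical pixel (x, y) inside dihedral variant k (0..7);
// variants 4..7 are transposed, so their row stride is h instead of w
// (same layout as the new rife_preproc_tta.comp / rife_postproc_tta.comp)
static int variant_index(int k, int x, int y, int w, int h)
{
    switch (k)
    {
    case 0: return y * w + x;                     // identity
    case 1: return y * w + (w - 1 - x);           // horizontal flip
    case 2: return (h - 1 - y) * w + (w - 1 - x); // 180 degree rotation
    case 3: return (h - 1 - y) * w + x;           // vertical flip
    case 4: return x * h + y;                     // transpose
    case 5: return x * h + (h - 1 - y);           // transposed + flip
    case 6: return (w - 1 - x) * h + (h - 1 - y); // anti-transpose
    default: return (w - 1 - x) * h + y;          // transposed + flip
    }
}

// hypothetical model: here just a per-pixel blend of the two input frames
static Image infer(const Image& a, const Image& b)
{
    Image o = a;
    for (size_t i = 0; i < o.d.size(); i++)
        o.d[i] = 0.5f * (a.d[i] + b.d[i]);
    return o;
}

static Image interpolate_tta(const Image& in0, const Image& in1)
{
    const int w = in0.w, h = in0.h;
    Image avg = { w, h, std::vector<float>(w * h, 0.f) };
    for (int k = 0; k < 8; k++)
    {
        // build the k-th augmented input pair
        Image a = { k < 4 ? w : h, k < 4 ? h : w, std::vector<float>(w * h) };
        Image b = a;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
            {
                int i = variant_index(k, x, y, w, h);
                a.d[i] = in0.d[y * w + x];
                b.d[i] = in1.d[y * w + x];
            }

        // run the model on the augmented pair, then accumulate its output
        // back in the canonical orientation (1/8 weight per variant)
        Image out = infer(a, b);
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
                avg.d[y * w + x] += out.d[variant_index(k, x, y, w, h)] * 0.125f;
    }
    return avg;
}

int main()
{
    Image in0 = { 4, 3, std::vector<float>(4 * 3, 0.2f) };
    Image in1 = { 4, 3, std::vector<float>(4 * 3, 0.8f) };
    Image mid = interpolate_tta(in0, in1);
    printf("%.3f\n", mid.d[0]); // 0.500 with this toy model
    return 0;
}
```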

README.md

@@ -74,6 +74,7 @@ Usage: rife-ncnn-vulkan -0 infile -1 infile1 -o outfile [options]...
-m model-path rife model path (default=rife-HD)
-g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu
-j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu
-x enable tta mode
-f pattern-format output image filename pattern format (%08d.jpg/png/webp, default=ext/%08d.png)
```
@@ -122,7 +123,6 @@ cmake --build . -j 4
### TODO
* test-time spatial augmentation aka TTA-s
* test-time temporal augmentation aka TTA-t
### Model
@@ -141,13 +141,21 @@ cmake --build . -j 4
![origin0](images/0.png)
![origin1](images/1.png)
### Interpolate with rife rife-HD model
### Interpolate with rife rife-anime model
```shell
rife-ncnn-vulkan.exe -m models/rife-HD -0 0.png -1 1.png -o out.png
rife-ncnn-vulkan.exe -m models/rife-anime -0 0.png -1 1.png -o out.png
```
![cain](images/out.png)
![rife](images/out.png)
### Interpolate with rife rife-anime model + TTA-s
```shell
rife-ncnn-vulkan.exe -m models/rife-anime -x -0 0.png -1 1.png -o out.png
```
![rife](images/outx.png)
## Original RIFE Project

images/out.png (binary file changed, 232 KiB -> 309 KiB)

images/outx.png (new binary file, 250 KiB)

src/CMakeLists.txt

@@ -46,7 +46,7 @@ set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
include(CheckIPOSupported)
check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
if(ipo_supported)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else()
message(WARNING "IPO is not supported: ${ipo_supported_output}")
endif()
@@ -221,6 +221,9 @@ endif()
rife_add_shader(rife_preproc.comp)
rife_add_shader(rife_postproc.comp)
rife_add_shader(rife_preproc_tta.comp)
rife_add_shader(rife_postproc_tta.comp)
rife_add_shader(rife_flow_tta_avg.comp)
rife_add_shader(warp.comp)
rife_add_shader(warp_pack4.comp)
rife_add_shader(warp_pack8.comp)

src/main.cpp

@@ -112,6 +112,7 @@ static void print_usage()
fprintf(stderr, " -m model-path rife model path (default=rife-HD)\n");
fprintf(stderr, " -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu\n");
fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n");
fprintf(stderr, " -x enable tta mode\n");
fprintf(stderr, " -f pattern-format output image filename pattern format (%%08d.jpg/png/webp, default=ext/%%08d.png)\n");
}
@@ -447,12 +448,13 @@ int main(int argc, char** argv)
std::vector<int> jobs_proc;
int jobs_save = 2;
int verbose = 0;
int tta_mode = 0;
path_t pattern_format = PATHSTR("%08d.png");
#if _WIN32
setlocale(LC_ALL, "");
wchar_t opt;
while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vh")) != (wchar_t)-1)
while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vxh")) != (wchar_t)-1)
{
switch (opt)
{
@@ -484,6 +486,9 @@ int main(int argc, char** argv)
case L'v':
verbose = 1;
break;
case L'x':
tta_mode = 1;
break;
case L'h':
default:
print_usage();
@@ -492,7 +497,7 @@ int main(int argc, char** argv)
}
#else // _WIN32
int opt;
while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vh")) != -1)
while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vxh")) != -1)
{
switch (opt)
{
@@ -524,6 +529,9 @@ int main(int argc, char** argv)
case 'v':
verbose = 1;
break;
case 'x':
tta_mode = 1;
break;
case 'h':
default:
print_usage();
@@ -728,7 +736,7 @@ int main(int argc, char** argv)
int total_jobs_proc = 0;
for (int i=0; i<use_gpu_count; i++)
{
int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count;
int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count();
jobs_proc[i] = std::min(jobs_proc[i], gpu_queue_count);
total_jobs_proc += jobs_proc[i];
}
@@ -738,7 +746,7 @@ int main(int argc, char** argv)
for (int i=0; i<use_gpu_count; i++)
{
rife[i] = new RIFE(gpuid[i]);
rife[i] = new RIFE(gpuid[i], tta_mode);
rife[i]->load(modeldir);
}

src/ncnn (submodule)

@@ -1 +1 @@
Subproject commit 124d2c3d854cabe8c39dc13993b36dc4efd13713
Subproject commit 1a81be6259c032c42271b7d666cb4a2494e54a50

src/rife.cpp

@@ -8,16 +8,21 @@
#include "rife_preproc.comp.hex.h"
#include "rife_postproc.comp.hex.h"
#include "rife_preproc_tta.comp.hex.h"
#include "rife_postproc_tta.comp.hex.h"
#include "rife_flow_tta_avg.comp.hex.h"
#include "rife_ops.h"
DEFINE_LAYER_CREATOR(Warp)
RIFE::RIFE(int gpuid)
RIFE::RIFE(int gpuid, bool _tta_mode)
{
vkdev = ncnn::get_gpu_device(gpuid);
rife_preproc = 0;
rife_postproc = 0;
rife_flow_tta_avg = 0;
tta_mode = _tta_mode;
}
RIFE::~RIFE()
@@ -26,6 +31,7 @@ RIFE::~RIFE()
{
delete rife_preproc;
delete rife_postproc;
delete rife_flow_tta_avg;
}
}
@@ -124,7 +130,10 @@ int RIFE::load(const std::string& modeldir)
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
if (tta_mode)
compile_spirv_module(rife_preproc_tta_comp_data, sizeof(rife_preproc_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
}
}
@@ -140,7 +149,10 @@ int RIFE::load(const std::string& modeldir)
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
if (tta_mode)
compile_spirv_module(rife_postproc_tta_comp_data, sizeof(rife_postproc_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
}
}
@@ -150,6 +162,25 @@ int RIFE::load(const std::string& modeldir)
}
}
if (tta_mode)
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_flow_tta_avg_comp_data, sizeof(rife_flow_tta_avg_comp_data), opt, spirv);
}
}
std::vector<ncnn::vk_specialization_type> specializations(0);
rife_flow_tta_avg = new ncnn::Pipeline(vkdev);
rife_flow_tta_avg->set_optimal_local_size_xyz(8, 8, 1);
rife_flow_tta_avg->create(spirv.data(), spirv.size() * 4, specializations);
}
return 0;
}
@@ -217,148 +248,370 @@ int RIFE::process(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float ti
cmd.record_clone(in1, in1_gpu, opt);
}
// preproc
ncnn::VkMat in0_gpu_padded;
ncnn::VkMat in1_gpu_padded;
{
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded.w;
constants[4].i = in0_gpu_padded.h;
constants[5].i = in0_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
}
{
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded.w;
constants[4].i = in1_gpu_padded.h;
constants[5].i = in1_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
}
// flownet
ncnn::VkMat flow;
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input0", in0_gpu_padded);
ex.input("input1", in1_gpu_padded);
ex.extract("flow", flow, cmd);
}
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded);
ex.input("flow.0", flow);
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded);
ex.input("flow.1", flow);
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
ncnn::VkMat out_gpu_padded;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded);
ex.input("img1", in1_gpu_padded);
ex.input("flow", flow);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
in0_gpu.release();
in1_gpu.release();
flow.release();
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded, cmd);
}
ncnn::VkMat out_gpu;
if (opt.use_fp16_storage && opt.use_int8_storage)
if (tta_mode)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
// preproc
ncnn::VkMat in0_gpu_padded[8];
ncnn::VkMat in1_gpu_padded[8];
{
in0_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded[0];
bindings[2] = in0_gpu_padded[1];
bindings[3] = in0_gpu_padded[2];
bindings[4] = in0_gpu_padded[3];
bindings[5] = in0_gpu_padded[4];
bindings[6] = in0_gpu_padded[5];
bindings[7] = in0_gpu_padded[6];
bindings[8] = in0_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded[0].w;
constants[4].i = in0_gpu_padded[0].h;
constants[5].i = in0_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded[0]);
}
{
in1_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded[0];
bindings[2] = in1_gpu_padded[1];
bindings[3] = in1_gpu_padded[2];
bindings[4] = in1_gpu_padded[3];
bindings[5] = in1_gpu_padded[4];
bindings[6] = in1_gpu_padded[5];
bindings[7] = in1_gpu_padded[6];
bindings[8] = in1_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded[0].w;
constants[4].i = in1_gpu_padded[0].h;
constants[5].i = in1_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded[0]);
}
ncnn::VkMat flow[8];
for (int ti = 0; ti < 8; ti++)
{
// flownet
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input0", in0_gpu_padded[ti]);
ex.input("input1", in1_gpu_padded[ti]);
ex.extract("flow", flow[ti], cmd);
}
}
// avg flow
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow[0];
bindings[1] = flow[1];
bindings[2] = flow[2];
bindings[3] = flow[3];
bindings[4] = flow[4];
bindings[5] = flow[5];
bindings[6] = flow[6];
bindings[7] = flow[7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[0].w;
constants[1].i = flow[0].h;
constants[2].i = flow[0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[0].w;
dispatcher.h = flow[0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
}
ncnn::VkMat out_gpu_padded[8];
for (int ti = 0; ti < 8; ti++)
{
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded[ti]);
ex.input("flow.0", flow[ti]);
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded[ti]);
ex.input("flow.1", flow[ti]);
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded[ti]);
ex.input("img1", in1_gpu_padded[ti]);
ex.input("flow", flow[ti]);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
if (ti == 0)
{
in0_gpu.release();
in1_gpu.release();
}
else
{
in0_gpu_padded[ti - 1].release();
in1_gpu_padded[ti - 1].release();
flow[ti - 1].release();
}
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded[ti], cmd);
}
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = out_gpu_padded[0];
bindings[1] = out_gpu_padded[1];
bindings[2] = out_gpu_padded[2];
bindings[3] = out_gpu_padded[3];
bindings[4] = out_gpu_padded[4];
bindings[5] = out_gpu_padded[5];
bindings[6] = out_gpu_padded[6];
bindings[7] = out_gpu_padded[7];
bindings[8] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded[0].w;
constants[1].i = out_gpu_padded[0].h;
constants[2].i = out_gpu_padded[0].cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// preproc
ncnn::VkMat in0_gpu_padded;
ncnn::VkMat in1_gpu_padded;
{
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
// postproc
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu;
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded.w;
constants[4].i = in0_gpu_padded.h;
constants[5].i = in0_gpu_padded.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
}
{
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded.w;
constants[4].i = in1_gpu_padded.h;
constants[5].i = in1_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
}
// flownet
ncnn::VkMat flow;
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input0", in0_gpu_padded);
ex.input("input1", in1_gpu_padded);
ex.extract("flow", flow, cmd);
}
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded);
ex.input("flow.0", flow);
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded);
ex.input("flow.1", flow);
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
ncnn::VkMat out_gpu_padded;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded);
ex.input("img1", in1_gpu_padded);
ex.input("flow", flow);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
in0_gpu.release();
in1_gpu.release();
flow.release();
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded, cmd);
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
// download

src/rife.h

@@ -11,7 +11,7 @@
class RIFE
{
public:
RIFE(int gpuid);
RIFE(int gpuid, bool tta_mode = false);
~RIFE();
#if _WIN32
@@ -29,6 +29,8 @@ private:
ncnn::Net fusionnet;
ncnn::Pipeline* rife_preproc;
ncnn::Pipeline* rife_postproc;
ncnn::Pipeline* rife_flow_tta_avg;
bool tta_mode;
};
#endif // RIFE_H

src/rife_flow_tta_avg.comp (new file)

@@ -0,0 +1,72 @@
// rife implemented with ncnn library
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
layout (binding = 0) buffer flow_blob0 { sfp flow_blob0_data[]; };
layout (binding = 1) buffer flow_blob1 { sfp flow_blob1_data[]; };
layout (binding = 2) buffer flow_blob2 { sfp flow_blob2_data[]; };
layout (binding = 3) buffer flow_blob3 { sfp flow_blob3_data[]; };
layout (binding = 4) buffer flow_blob4 { sfp flow_blob4_data[]; };
layout (binding = 5) buffer flow_blob5 { sfp flow_blob5_data[]; };
layout (binding = 6) buffer flow_blob6 { sfp flow_blob6_data[]; };
layout (binding = 7) buffer flow_blob7 { sfp flow_blob7_data[]; };
layout (push_constant) uniform parameter
{
int w;
int h;
int cstep;
} p;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= p.w || gy >= p.h || gz >= 1)
return;
float x0 = float(flow_blob0_data[gy * p.w + gx]);
float x1 = float(flow_blob1_data[gy * p.w + (p.w - 1 - gx)]);
float x2 = float(flow_blob2_data[(p.h - 1 - gy) * p.w + (p.w - 1 - gx)]);
float x3 = float(flow_blob3_data[(p.h - 1 - gy) * p.w + gx]);
float x4 = float(flow_blob4_data[gx * p.h + gy]);
float x5 = float(flow_blob5_data[gx * p.h + (p.h - 1 - gy)]);
float x6 = float(flow_blob6_data[(p.w - 1 - gx) * p.h + (p.h - 1 - gy)]);
float x7 = float(flow_blob7_data[(p.w - 1 - gx) * p.h + gy]);
float y0 = float(flow_blob0_data[p.cstep + gy * p.w + gx]);
float y1 = float(flow_blob1_data[p.cstep + gy * p.w + (p.w - 1 - gx)]);
float y2 = float(flow_blob2_data[p.cstep + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]);
float y3 = float(flow_blob3_data[p.cstep + (p.h - 1 - gy) * p.w + gx]);
float y4 = float(flow_blob4_data[p.cstep + gx * p.h + gy]);
float y5 = float(flow_blob5_data[p.cstep + gx * p.h + (p.h - 1 - gy)]);
float y6 = float(flow_blob6_data[p.cstep + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]);
float y7 = float(flow_blob7_data[p.cstep + (p.w - 1 - gx) * p.h + gy]);
float x = (x0 + -x1 + -x2 + x3 + y4 + y5 + -y6 + -y7) * 0.125f;
float y = (y0 + y1 + -y2 + -y3 + x4 + -x5 + -x6 + x7) * 0.125f;
flow_blob0_data[gy * p.w + gx] = sfp(x);
flow_blob1_data[gy * p.w + (p.w - 1 - gx)] = sfp(-x);
flow_blob2_data[(p.h - 1 - gy) * p.w + (p.w - 1 - gx)] = sfp(-x);
flow_blob3_data[(p.h - 1 - gy) * p.w + gx] = sfp(x);
flow_blob4_data[gx * p.h + gy] = sfp(y);
flow_blob5_data[gx * p.h + (p.h - 1 - gy)] = sfp(-y);
flow_blob6_data[(p.w - 1 - gx) * p.h + (p.h - 1 - gy)] = sfp(-y);
flow_blob7_data[(p.w - 1 - gx) * p.h + gy] = sfp(y);
flow_blob0_data[p.cstep + gy * p.w + gx] = sfp(y);
flow_blob1_data[p.cstep + gy * p.w + (p.w - 1 - gx)] = sfp(y);
flow_blob2_data[p.cstep + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)] = sfp(-y);
flow_blob3_data[p.cstep + (p.h - 1 - gy) * p.w + gx] = sfp(-y);
flow_blob4_data[p.cstep + gx * p.h + gy] = sfp(x);
flow_blob5_data[p.cstep + gx * p.h + (p.h - 1 - gy)] = sfp(x);
flow_blob6_data[p.cstep + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)] = sfp(-x);
flow_blob7_data[p.cstep + (p.w - 1 - gx) * p.h + gy] = sfp(-x);
}
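
The sign flips and component swaps above follow from how a 2-D flow vector transforms under each dihedral variant (0-3 are the flips, 4-7 the transposed variants, in the layout produced by rife_preproc_tta.comp further down). The snippet below is a small CPU reference for the mapping the shader applies before averaging; it is a reading aid, not a drop-in replacement. After averaging, the shader writes the result back into all eight flow blobs with the corresponding forward mapping.

```cpp
#include <cstdio>
#include <utility>

// Map a flow vector measured in dihedral variant k back to the canonical
// orientation. fu/fv are the x/y channels read from the k-th flow blob;
// the returned (dx, dy) is what gets accumulated into the 1/8 average.
static std::pair<float, float> flow_to_canonical(int k, float fu, float fv)
{
    switch (k)
    {
    case 0: return {  fu,  fv }; // identity
    case 1: return { -fu,  fv }; // horizontal flip negates x
    case 2: return { -fu, -fv }; // 180 degree rotation negates both
    case 3: return {  fu, -fv }; // vertical flip negates y
    case 4: return {  fv,  fu }; // transpose swaps the components
    case 5: return {  fv, -fu }; // 90 degree rotation: swap, negate one
    case 6: return { -fv, -fu }; // anti-transpose: swap, negate both
    default: return { -fv,  fu }; // 90 degree rotation the other way
    }
}

int main()
{
    // a flow of (+3, +1) measured in the horizontally flipped variant is
    // (-3, +1) in the canonical frame, matching the -x1 / +y1 terms above
    std::pair<float, float> d = flow_to_canonical(1, 3.f, 1.f);
    printf("%.0f %.0f\n", d.first, d.second); // -3 1
    return 0;
}
```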

src/rife_postproc_tta.comp (new file)

@@ -0,0 +1,81 @@
// rife implemented with ncnn library
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_int8_storage
#extension GL_EXT_shader_8bit_storage: require
#endif
layout (constant_id = 0) const int bgr = 0;
layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; };
layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; };
layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; };
layout (binding = 3) readonly buffer bottom_blob3 { sfp bottom_blob3_data[]; };
layout (binding = 4) readonly buffer bottom_blob4 { sfp bottom_blob4_data[]; };
layout (binding = 5) readonly buffer bottom_blob5 { sfp bottom_blob5_data[]; };
layout (binding = 6) readonly buffer bottom_blob6 { sfp bottom_blob6_data[]; };
layout (binding = 7) readonly buffer bottom_blob7 { sfp bottom_blob7_data[]; };
#if NCNN_int8_storage
layout (binding = 8) writeonly buffer top_blob { uint8_t top_blob_data[]; };
#else
layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; };
#endif
layout (push_constant) uniform parameter
{
int w;
int h;
int cstep;
int outw;
int outh;
int outcstep;
} p;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= p.outw || gy >= p.outh || gz >= 3)
return;
int gzi = gz * p.cstep;
float v0 = float(bottom_blob0_data[gzi + gy * p.w + gx]);
float v1 = float(bottom_blob1_data[gzi + gy * p.w + (p.w - 1 - gx)]);
float v2 = float(bottom_blob2_data[gzi + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]);
float v3 = float(bottom_blob3_data[gzi + (p.h - 1 - gy) * p.w + gx]);
float v4 = float(bottom_blob4_data[gzi + gx * p.h + gy]);
float v5 = float(bottom_blob5_data[gzi + gx * p.h + (p.h - 1 - gy)]);
float v6 = float(bottom_blob6_data[gzi + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]);
float v7 = float(bottom_blob7_data[gzi + (p.w - 1 - gx) * p.h + gy]);
float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f;
const float denorm_val = 255.f;
const float clip_eps = 0.5f;
v = v * denorm_val + clip_eps;
#if NCNN_int8_storage
int v_offset = gy * p.outw + gx;
uint v32 = clamp(uint(floor(v)), 0, 255);
if (bgr == 0)
top_blob_data[v_offset * 3 + gz] = uint8_t(v32);
else
top_blob_data[v_offset * 3 + 2 - gz] = uint8_t(v32);
#else
int v_offset = gz * p.outcstep + gy * p.outw + gx;
top_blob_data[v_offset] = v;
#endif
}
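
For reference, the int8 output path above averages the eight variant outputs, denormalizes by 255 with a +0.5 rounding offset (the denorm_val and clip_eps constants), then floors and clamps to [0, 255]. A tiny worked example of that arithmetic, as a standalone sketch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    float v_variant[8] = { 0.500f, 0.502f, 0.501f, 0.499f,
                           0.503f, 0.498f, 0.500f, 0.501f };
    float v = 0.f;
    for (int i = 0; i < 8; i++)
        v += v_variant[i];
    v = v * 0.125f;            // average of the 8 dihedral variants
    v = v * 255.f + 0.5f;      // denorm_val and clip_eps in the shader
    int v32 = std::min(std::max((int)std::floor(v), 0), 255);
    printf("%d\n", v32);       // prints 128
    return 0;
}
```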

src/rife_preproc_tta.comp (new file)

@@ -0,0 +1,93 @@
// rife implemented with ncnn library
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_int8_storage
#extension GL_EXT_shader_8bit_storage: require
#endif
layout (constant_id = 0) const int bgr = 0;
#if NCNN_int8_storage
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob0 { sfp top_blob0_data[]; };
layout (binding = 2) writeonly buffer top_blob1 { sfp top_blob1_data[]; };
layout (binding = 3) writeonly buffer top_blob2 { sfp top_blob2_data[]; };
layout (binding = 4) writeonly buffer top_blob3 { sfp top_blob3_data[]; };
layout (binding = 5) writeonly buffer top_blob4 { sfp top_blob4_data[]; };
layout (binding = 6) writeonly buffer top_blob5 { sfp top_blob5_data[]; };
layout (binding = 7) writeonly buffer top_blob6 { sfp top_blob6_data[]; };
layout (binding = 8) writeonly buffer top_blob7 { sfp top_blob7_data[]; };
layout (push_constant) uniform parameter
{
int w;
int h;
int cstep;
int outw;
int outh;
int outcstep;
} p;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= p.outw || gy >= p.outh || gz >= 3)
return;
if (gx < 0 || gx >= p.w || gy < 0 || gy >= p.h)
{
int gzi = gz * p.outcstep;
top_blob0_data[gzi + gy * p.outw + gx] = sfp(0.f);
top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(0.f);
top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(0.f);
top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(0.f);
top_blob4_data[gzi + gx * p.outh + gy] = sfp(0.f);
top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(0.f);
top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = sfp(0.f);
top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(0.f);
return;
}
#if NCNN_int8_storage
int v_offset = gy * p.w + gx;
float v;
if (bgr == 0)
v = float(uint(bottom_blob_data[v_offset * 3 + gz]));
else
v = float(uint(bottom_blob_data[v_offset * 3 + 2 - gz]));
#else
int v_offset = gz * p.cstep + gy * p.w + gx;
float v = bottom_blob_data[v_offset];
#endif
const float norm_val = 1 / 255.f;
v = v * norm_val;
int gzi = gz * p.outcstep;
top_blob0_data[gzi + gy * p.outw + gx] = sfp(v);
top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(v);
top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(v);
top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(v);
top_blob4_data[gzi + gx * p.outh + gy] = sfp(v);
top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(v);
top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = sfp(v);
top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(v);
}