diff --git a/README.md b/README.md index 04cdd03..545d011 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ Usage: rife-ncnn-vulkan -0 infile -1 infile1 -o outfile [options]... -m model-path rife model path (default=rife-HD) -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu + -x enable tta mode -f pattern-format output image filename pattern format (%08d.jpg/png/webp, default=ext/%08d.png) ``` @@ -122,7 +123,6 @@ cmake --build . -j 4 ### TODO -* test-time sptial augmentation aka TTA-s * test-time temporal augmentation aka TTA-t ### Model @@ -141,13 +141,21 @@ cmake --build . -j 4 ![origin0](images/0.png) ![origin1](images/1.png) -### Interpolate with rife rife-HD model +### Interpolate with rife rife-anime model ```shell -rife-ncnn-vulkan.exe -m models/rife-HD -0 0.png -1 1.png -o out.png +rife-ncnn-vulkan.exe -m models/rife-anime -0 0.png -1 1.png -o out.png ``` -![cain](images/out.png) +![rife](images/out.png) + +### Interpolate with rife rife-anime model + TTA-s + +```shell +rife-ncnn-vulkan.exe -m models/rife-anime -x -0 0.png -1 1.png -o out.png +``` + +![rife](images/outx.png) ## Original RIFE Project diff --git a/images/out.png b/images/out.png index ce471a8..9756c8d 100644 Binary files a/images/out.png and b/images/out.png differ diff --git a/images/outx.png b/images/outx.png new file mode 100644 index 0000000..41f9066 Binary files /dev/null and b/images/outx.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fda96e2..a8ebd87 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -46,7 +46,7 @@ set(CMAKE_POLICY_DEFAULT_CMP0069 NEW) include(CheckIPOSupported) check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output) if(ipo_supported) - set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else() message(WARNING "IPO is not supported: ${ipo_supported_output}") 
endif() @@ -221,6 +221,9 @@ endif() rife_add_shader(rife_preproc.comp) rife_add_shader(rife_postproc.comp) +rife_add_shader(rife_preproc_tta.comp) +rife_add_shader(rife_postproc_tta.comp) +rife_add_shader(rife_flow_tta_avg.comp) rife_add_shader(warp.comp) rife_add_shader(warp_pack4.comp) rife_add_shader(warp_pack8.comp) diff --git a/src/main.cpp b/src/main.cpp index 1e38733..f8716d3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -112,6 +112,7 @@ static void print_usage() fprintf(stderr, " -m model-path rife model path (default=rife-HD)\n"); fprintf(stderr, " -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu\n"); fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n"); + fprintf(stderr, " -x enable tta mode\n"); fprintf(stderr, " -f pattern-format output image filename pattern format (%%08d.jpg/png/webp, default=ext/%%08d.png)\n"); } @@ -447,12 +448,13 @@ int main(int argc, char** argv) std::vector jobs_proc; int jobs_save = 2; int verbose = 0; + int tta_mode = 0; path_t pattern_format = PATHSTR("%08d.png"); #if _WIN32 setlocale(LC_ALL, ""); wchar_t opt; - while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vh")) != (wchar_t)-1) + while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vxh")) != (wchar_t)-1) { switch (opt) { @@ -484,6 +486,9 @@ int main(int argc, char** argv) case L'v': verbose = 1; break; + case L'x': + tta_mode = 1; + break; case L'h': default: print_usage(); @@ -492,7 +497,7 @@ int main(int argc, char** argv) } #else // _WIN32 int opt; - while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vh")) != -1) + while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vxh")) != -1) { switch (opt) { @@ -524,6 +529,9 @@ int main(int argc, char** argv) case 'v': verbose = 1; break; + case 'x': + tta_mode = 1; + break; case 'h': default: print_usage(); @@ -728,7 +736,7 @@ int main(int argc, char** argv) int total_jobs_proc = 0; for (int i=0; iload(modeldir); } diff --git a/src/ncnn
b/src/ncnn index 124d2c3..1a81be6 160000 --- a/src/ncnn +++ b/src/ncnn @@ -1 +1 @@ -Subproject commit 124d2c3d854cabe8c39dc13993b36dc4efd13713 +Subproject commit 1a81be6259c032c42271b7d666cb4a2494e54a50 diff --git a/src/rife.cpp b/src/rife.cpp index acc3f23..fbd29ec 100644 --- a/src/rife.cpp +++ b/src/rife.cpp @@ -8,16 +8,21 @@ #include "rife_preproc.comp.hex.h" #include "rife_postproc.comp.hex.h" +#include "rife_preproc_tta.comp.hex.h" +#include "rife_postproc_tta.comp.hex.h" +#include "rife_flow_tta_avg.comp.hex.h" #include "rife_ops.h" DEFINE_LAYER_CREATOR(Warp) -RIFE::RIFE(int gpuid) +RIFE::RIFE(int gpuid, bool _tta_mode) { vkdev = ncnn::get_gpu_device(gpuid); rife_preproc = 0; rife_postproc = 0; + rife_flow_tta_avg = 0; + tta_mode = _tta_mode; } RIFE::~RIFE() @@ -26,6 +31,7 @@ RIFE::~RIFE() { delete rife_preproc; delete rife_postproc; + delete rife_flow_tta_avg; } } @@ -124,7 +130,10 @@ int RIFE::load(const std::string& modeldir) ncnn::MutexLockGuard guard(lock); if (spirv.empty()) { - compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv); + if (tta_mode) + compile_spirv_module(rife_preproc_tta_comp_data, sizeof(rife_preproc_tta_comp_data), opt, spirv); + else + compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv); } } @@ -140,7 +149,10 @@ int RIFE::load(const std::string& modeldir) ncnn::MutexLockGuard guard(lock); if (spirv.empty()) { - compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv); + if (tta_mode) + compile_spirv_module(rife_postproc_tta_comp_data, sizeof(rife_postproc_tta_comp_data), opt, spirv); + else + compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv); } } @@ -150,6 +162,25 @@ int RIFE::load(const std::string& modeldir) } } + if (tta_mode) + { + static std::vector spirv; + static ncnn::Mutex lock; + { + ncnn::MutexLockGuard guard(lock); + if (spirv.empty()) + { + 
compile_spirv_module(rife_flow_tta_avg_comp_data, sizeof(rife_flow_tta_avg_comp_data), opt, spirv); + } + } + + std::vector specializations(0); + + rife_flow_tta_avg = new ncnn::Pipeline(vkdev); + rife_flow_tta_avg->set_optimal_local_size_xyz(8, 8, 1); + rife_flow_tta_avg->create(spirv.data(), spirv.size() * 4, specializations); + } + return 0; } @@ -217,148 +248,370 @@ int RIFE::process(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float ti cmd.record_clone(in1, in1_gpu, opt); } - // preproc - ncnn::VkMat in0_gpu_padded; - ncnn::VkMat in1_gpu_padded; - { - in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); - - std::vector bindings(2); - bindings[0] = in0_gpu; - bindings[1] = in0_gpu_padded; - - std::vector constants(6); - constants[0].i = in0_gpu.w; - constants[1].i = in0_gpu.h; - constants[2].i = in0_gpu.cstep; - constants[3].i = in0_gpu_padded.w; - constants[4].i = in0_gpu_padded.h; - constants[5].i = in0_gpu_padded.cstep; - - cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded); - } - { - in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); - - std::vector bindings(2); - bindings[0] = in1_gpu; - bindings[1] = in1_gpu_padded; - - std::vector constants(6); - constants[0].i = in1_gpu.w; - constants[1].i = in1_gpu.h; - constants[2].i = in1_gpu.cstep; - constants[3].i = in1_gpu_padded.w; - constants[4].i = in1_gpu_padded.h; - constants[5].i = in1_gpu_padded.cstep; - - cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded); - } - - // flownet - ncnn::VkMat flow; - { - ncnn::Extractor ex = flownet.create_extractor(); - ex.set_blob_vkallocator(blob_vkallocator); - ex.set_workspace_vkallocator(blob_vkallocator); - ex.set_staging_vkallocator(staging_vkallocator); - - ex.input("input0", in0_gpu_padded); - ex.input("input1", in1_gpu_padded); - ex.extract("flow", flow, cmd); - } - - // contextnet - ncnn::VkMat ctx0[4]; - ncnn::VkMat ctx1[4]; - { - 
ncnn::Extractor ex = contextnet.create_extractor(); - ex.set_blob_vkallocator(blob_vkallocator); - ex.set_workspace_vkallocator(blob_vkallocator); - ex.set_staging_vkallocator(staging_vkallocator); - - ex.input("input.1", in0_gpu_padded); - ex.input("flow.0", flow); - ex.extract("f1", ctx0[0], cmd); - ex.extract("f2", ctx0[1], cmd); - ex.extract("f3", ctx0[2], cmd); - ex.extract("f4", ctx0[3], cmd); - } - { - ncnn::Extractor ex = contextnet.create_extractor(); - ex.set_blob_vkallocator(blob_vkallocator); - ex.set_workspace_vkallocator(blob_vkallocator); - ex.set_staging_vkallocator(staging_vkallocator); - - ex.input("input.1", in1_gpu_padded); - ex.input("flow.1", flow); - ex.extract("f1", ctx1[0], cmd); - ex.extract("f2", ctx1[1], cmd); - ex.extract("f3", ctx1[2], cmd); - ex.extract("f4", ctx1[3], cmd); - } - - // fusionnet - ncnn::VkMat out_gpu_padded; - { - ncnn::Extractor ex = fusionnet.create_extractor(); - ex.set_blob_vkallocator(blob_vkallocator); - ex.set_workspace_vkallocator(blob_vkallocator); - ex.set_staging_vkallocator(staging_vkallocator); - - ex.input("img0", in0_gpu_padded); - ex.input("img1", in1_gpu_padded); - ex.input("flow", flow); - ex.input("3", ctx0[0]); - ex.input("4", ctx0[1]); - ex.input("5", ctx0[2]); - ex.input("6", ctx0[3]); - ex.input("7", ctx1[0]); - ex.input("8", ctx1[1]); - ex.input("9", ctx1[2]); - ex.input("10", ctx1[3]); - - // save some memory - in0_gpu.release(); - in1_gpu.release(); - flow.release(); - ctx0[0].release(); - ctx0[1].release(); - ctx0[2].release(); - ctx0[3].release(); - ctx1[0].release(); - ctx1[1].release(); - ctx1[2].release(); - ctx1[3].release(); - - ex.extract("output", out_gpu_padded, cmd); - } - ncnn::VkMat out_gpu; - if (opt.use_fp16_storage && opt.use_int8_storage) + + if (tta_mode) { - out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator); + // preproc + ncnn::VkMat in0_gpu_padded[8]; + ncnn::VkMat in1_gpu_padded[8]; + { + in0_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, 
blob_vkallocator); + in0_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in0_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in0_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in0_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in0_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in0_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in0_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + + std::vector bindings(9); + bindings[0] = in0_gpu; + bindings[1] = in0_gpu_padded[0]; + bindings[2] = in0_gpu_padded[1]; + bindings[3] = in0_gpu_padded[2]; + bindings[4] = in0_gpu_padded[3]; + bindings[5] = in0_gpu_padded[4]; + bindings[6] = in0_gpu_padded[5]; + bindings[7] = in0_gpu_padded[6]; + bindings[8] = in0_gpu_padded[7]; + + std::vector constants(6); + constants[0].i = in0_gpu.w; + constants[1].i = in0_gpu.h; + constants[2].i = in0_gpu.cstep; + constants[3].i = in0_gpu_padded[0].w; + constants[4].i = in0_gpu_padded[0].h; + constants[5].i = in0_gpu_padded[0].cstep; + + cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded[0]); + } + { + in1_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in1_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in1_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in1_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in1_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in1_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + in1_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, 
blob_vkallocator); + in1_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + + std::vector bindings(9); + bindings[0] = in1_gpu; + bindings[1] = in1_gpu_padded[0]; + bindings[2] = in1_gpu_padded[1]; + bindings[3] = in1_gpu_padded[2]; + bindings[4] = in1_gpu_padded[3]; + bindings[5] = in1_gpu_padded[4]; + bindings[6] = in1_gpu_padded[5]; + bindings[7] = in1_gpu_padded[6]; + bindings[8] = in1_gpu_padded[7]; + + std::vector constants(6); + constants[0].i = in1_gpu.w; + constants[1].i = in1_gpu.h; + constants[2].i = in1_gpu.cstep; + constants[3].i = in1_gpu_padded[0].w; + constants[4].i = in1_gpu_padded[0].h; + constants[5].i = in1_gpu_padded[0].cstep; + + cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded[0]); + } + + ncnn::VkMat flow[8]; + for (int ti = 0; ti < 8; ti++) + { + // flownet + { + ncnn::Extractor ex = flownet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("input0", in0_gpu_padded[ti]); + ex.input("input1", in1_gpu_padded[ti]); + ex.extract("flow", flow[ti], cmd); + } + } + + // avg flow + { + std::vector bindings(8); + bindings[0] = flow[0]; + bindings[1] = flow[1]; + bindings[2] = flow[2]; + bindings[3] = flow[3]; + bindings[4] = flow[4]; + bindings[5] = flow[5]; + bindings[6] = flow[6]; + bindings[7] = flow[7]; + + std::vector constants(3); + constants[0].i = flow[0].w; + constants[1].i = flow[0].h; + constants[2].i = flow[0].cstep; + + ncnn::VkMat dispatcher; + dispatcher.w = flow[0].w; + dispatcher.h = flow[0].h; + dispatcher.c = 1; + cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher); + } + + ncnn::VkMat out_gpu_padded[8]; + for (int ti = 0; ti < 8; ti++) + { + // contextnet + ncnn::VkMat ctx0[4]; + ncnn::VkMat ctx1[4]; + { + ncnn::Extractor ex = contextnet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + 
ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("input.1", in0_gpu_padded[ti]); + ex.input("flow.0", flow[ti]); + ex.extract("f1", ctx0[0], cmd); + ex.extract("f2", ctx0[1], cmd); + ex.extract("f3", ctx0[2], cmd); + ex.extract("f4", ctx0[3], cmd); + } + { + ncnn::Extractor ex = contextnet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("input.1", in1_gpu_padded[ti]); + ex.input("flow.1", flow[ti]); + ex.extract("f1", ctx1[0], cmd); + ex.extract("f2", ctx1[1], cmd); + ex.extract("f3", ctx1[2], cmd); + ex.extract("f4", ctx1[3], cmd); + } + + // fusionnet + { + ncnn::Extractor ex = fusionnet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("img0", in0_gpu_padded[ti]); + ex.input("img1", in1_gpu_padded[ti]); + ex.input("flow", flow[ti]); + ex.input("3", ctx0[0]); + ex.input("4", ctx0[1]); + ex.input("5", ctx0[2]); + ex.input("6", ctx0[3]); + ex.input("7", ctx1[0]); + ex.input("8", ctx1[1]); + ex.input("9", ctx1[2]); + ex.input("10", ctx1[3]); + + // save some memory + if (ti == 0) + { + in0_gpu.release(); + in1_gpu.release(); + } + else + { + in0_gpu_padded[ti - 1].release(); + in1_gpu_padded[ti - 1].release(); + flow[ti - 1].release(); + } + ctx0[0].release(); + ctx0[1].release(); + ctx0[2].release(); + ctx0[3].release(); + ctx1[0].release(); + ctx1[1].release(); + ctx1[2].release(); + ctx1[3].release(); + + ex.extract("output", out_gpu_padded[ti], cmd); + } + } + + if (opt.use_fp16_storage && opt.use_int8_storage) + { + out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator); + } + else + { + out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator); + } + + // postproc + { + std::vector bindings(9); + bindings[0] = out_gpu_padded[0]; + 
bindings[1] = out_gpu_padded[1]; + bindings[2] = out_gpu_padded[2]; + bindings[3] = out_gpu_padded[3]; + bindings[4] = out_gpu_padded[4]; + bindings[5] = out_gpu_padded[5]; + bindings[6] = out_gpu_padded[6]; + bindings[7] = out_gpu_padded[7]; + bindings[8] = out_gpu; + + std::vector constants(6); + constants[0].i = out_gpu_padded[0].w; + constants[1].i = out_gpu_padded[0].h; + constants[2].i = out_gpu_padded[0].cstep; + constants[3].i = out_gpu.w; + constants[4].i = out_gpu.h; + constants[5].i = out_gpu.cstep; + + cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu); + } } else { - out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator); - } + // preproc + ncnn::VkMat in0_gpu_padded; + ncnn::VkMat in1_gpu_padded; + { + in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); - // postproc - { - std::vector bindings(2); - bindings[0] = out_gpu_padded; - bindings[1] = out_gpu; + std::vector bindings(2); + bindings[0] = in0_gpu; + bindings[1] = in0_gpu_padded; - std::vector constants(6); - constants[0].i = out_gpu_padded.w; - constants[1].i = out_gpu_padded.h; - constants[2].i = out_gpu_padded.cstep; - constants[3].i = out_gpu.w; - constants[4].i = out_gpu.h; - constants[5].i = out_gpu.cstep; + std::vector constants(6); + constants[0].i = in0_gpu.w; + constants[1].i = in0_gpu.h; + constants[2].i = in0_gpu.cstep; + constants[3].i = in0_gpu_padded.w; + constants[4].i = in0_gpu_padded.h; + constants[5].i = in0_gpu_padded.cstep; - cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu); + cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded); + } + { + in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator); + + std::vector bindings(2); + bindings[0] = in1_gpu; + bindings[1] = in1_gpu_padded; + + std::vector constants(6); + constants[0].i = in1_gpu.w; + constants[1].i = in1_gpu.h; + constants[2].i = in1_gpu.cstep; + constants[3].i = in1_gpu_padded.w; + 
constants[4].i = in1_gpu_padded.h; + constants[5].i = in1_gpu_padded.cstep; + + cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded); + } + + // flownet + ncnn::VkMat flow; + { + ncnn::Extractor ex = flownet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("input0", in0_gpu_padded); + ex.input("input1", in1_gpu_padded); + ex.extract("flow", flow, cmd); + } + + // contextnet + ncnn::VkMat ctx0[4]; + ncnn::VkMat ctx1[4]; + { + ncnn::Extractor ex = contextnet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("input.1", in0_gpu_padded); + ex.input("flow.0", flow); + ex.extract("f1", ctx0[0], cmd); + ex.extract("f2", ctx0[1], cmd); + ex.extract("f3", ctx0[2], cmd); + ex.extract("f4", ctx0[3], cmd); + } + { + ncnn::Extractor ex = contextnet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("input.1", in1_gpu_padded); + ex.input("flow.1", flow); + ex.extract("f1", ctx1[0], cmd); + ex.extract("f2", ctx1[1], cmd); + ex.extract("f3", ctx1[2], cmd); + ex.extract("f4", ctx1[3], cmd); + } + + // fusionnet + ncnn::VkMat out_gpu_padded; + { + ncnn::Extractor ex = fusionnet.create_extractor(); + ex.set_blob_vkallocator(blob_vkallocator); + ex.set_workspace_vkallocator(blob_vkallocator); + ex.set_staging_vkallocator(staging_vkallocator); + + ex.input("img0", in0_gpu_padded); + ex.input("img1", in1_gpu_padded); + ex.input("flow", flow); + ex.input("3", ctx0[0]); + ex.input("4", ctx0[1]); + ex.input("5", ctx0[2]); + ex.input("6", ctx0[3]); + ex.input("7", ctx1[0]); + ex.input("8", ctx1[1]); + ex.input("9", ctx1[2]); + ex.input("10", ctx1[3]); + + // save some memory + 
in0_gpu.release(); + in1_gpu.release(); + flow.release(); + ctx0[0].release(); + ctx0[1].release(); + ctx0[2].release(); + ctx0[3].release(); + ctx1[0].release(); + ctx1[1].release(); + ctx1[2].release(); + ctx1[3].release(); + + ex.extract("output", out_gpu_padded, cmd); + } + + if (opt.use_fp16_storage && opt.use_int8_storage) + { + out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator); + } + else + { + out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator); + } + + // postproc + { + std::vector bindings(2); + bindings[0] = out_gpu_padded; + bindings[1] = out_gpu; + + std::vector constants(6); + constants[0].i = out_gpu_padded.w; + constants[1].i = out_gpu_padded.h; + constants[2].i = out_gpu_padded.cstep; + constants[3].i = out_gpu.w; + constants[4].i = out_gpu.h; + constants[5].i = out_gpu.cstep; + + cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu); + } } // download diff --git a/src/rife.h b/src/rife.h index fc2fe0f..16c61e0 100644 --- a/src/rife.h +++ b/src/rife.h @@ -11,7 +11,7 @@ class RIFE { public: - RIFE(int gpuid); + RIFE(int gpuid, bool tta_mode = false); ~RIFE(); #if _WIN32 @@ -29,6 +29,8 @@ private: ncnn::Net fusionnet; ncnn::Pipeline* rife_preproc; ncnn::Pipeline* rife_postproc; + ncnn::Pipeline* rife_flow_tta_avg; + bool tta_mode; }; #endif // RIFE_H diff --git a/src/rife_flow_tta_avg.comp b/src/rife_flow_tta_avg.comp new file mode 100644 index 0000000..bdc8a6c --- /dev/null +++ b/src/rife_flow_tta_avg.comp @@ -0,0 +1,72 @@ +// rife implemented with ncnn library + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif + +layout (binding = 0) buffer flow_blob0 { sfp flow_blob0_data[]; }; +layout (binding = 1) buffer flow_blob1 { sfp flow_blob1_data[]; }; +layout (binding = 2) buffer flow_blob2 { sfp flow_blob2_data[]; }; +layout (binding = 3) buffer flow_blob3 { sfp flow_blob3_data[]; }; +layout (binding = 4) buffer flow_blob4 { sfp flow_blob4_data[]; }; +layout (binding = 5) 
buffer flow_blob5 { sfp flow_blob5_data[]; }; +layout (binding = 6) buffer flow_blob6 { sfp flow_blob6_data[]; }; +layout (binding = 7) buffer flow_blob7 { sfp flow_blob7_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.w || gy >= p.h || gz >= 1) + return; + + float x0 = float(flow_blob0_data[gy * p.w + gx]); + float x1 = float(flow_blob1_data[gy * p.w + (p.w - 1 - gx)]); + float x2 = float(flow_blob2_data[(p.h - 1 - gy) * p.w + (p.w - 1 - gx)]); + float x3 = float(flow_blob3_data[(p.h - 1 - gy) * p.w + gx]); + float x4 = float(flow_blob4_data[gx * p.h + gy]); + float x5 = float(flow_blob5_data[gx * p.h + (p.h - 1 - gy)]); + float x6 = float(flow_blob6_data[(p.w - 1 - gx) * p.h + (p.h - 1 - gy)]); + float x7 = float(flow_blob7_data[(p.w - 1 - gx) * p.h + gy]); + + float y0 = float(flow_blob0_data[p.cstep + gy * p.w + gx]); + float y1 = float(flow_blob1_data[p.cstep + gy * p.w + (p.w - 1 - gx)]); + float y2 = float(flow_blob2_data[p.cstep + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]); + float y3 = float(flow_blob3_data[p.cstep + (p.h - 1 - gy) * p.w + gx]); + float y4 = float(flow_blob4_data[p.cstep + gx * p.h + gy]); + float y5 = float(flow_blob5_data[p.cstep + gx * p.h + (p.h - 1 - gy)]); + float y6 = float(flow_blob6_data[p.cstep + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]); + float y7 = float(flow_blob7_data[p.cstep + (p.w - 1 - gx) * p.h + gy]); + + float x = (x0 + -x1 + -x2 + x3 + y4 + y5 + -y6 + -y7) * 0.125f; + float y = (y0 + y1 + -y2 + -y3 + x4 + -x5 + -x6 + x7) * 0.125f; + + flow_blob0_data[gy * p.w + gx] = sfp(x); + flow_blob1_data[gy * p.w + (p.w - 1 - gx)] = sfp(-x); + flow_blob2_data[(p.h - 1 - gy) * p.w + (p.w - 1 - gx)] = sfp(-x); + flow_blob3_data[(p.h - 1 - gy) * p.w + gx] = sfp(x); + flow_blob4_data[gx * p.h + gy] = sfp(y); + flow_blob5_data[gx * p.h + 
(p.h - 1 - gy)] = sfp(-y); + flow_blob6_data[(p.w - 1 - gx) * p.h + (p.h - 1 - gy)] = sfp(-y); + flow_blob7_data[(p.w - 1 - gx) * p.h + gy] = sfp(y); + + flow_blob0_data[p.cstep + gy * p.w + gx] = sfp(y); + flow_blob1_data[p.cstep + gy * p.w + (p.w - 1 - gx)] = sfp(y); + flow_blob2_data[p.cstep + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)] = sfp(-y); + flow_blob3_data[p.cstep + (p.h - 1 - gy) * p.w + gx] = sfp(-y); + flow_blob4_data[p.cstep + gx * p.h + gy] = sfp(x); + flow_blob5_data[p.cstep + gx * p.h + (p.h - 1 - gy)] = sfp(x); + flow_blob6_data[p.cstep + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)] = sfp(-x); + flow_blob7_data[p.cstep + (p.w - 1 - gx) * p.h + gy] = sfp(-x); +} diff --git a/src/rife_postproc_tta.comp b/src/rife_postproc_tta.comp new file mode 100644 index 0000000..b050561 --- /dev/null +++ b/src/rife_postproc_tta.comp @@ -0,0 +1,81 @@ +// rife implemented with ncnn library + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; }; +layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; }; +layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; }; +layout (binding = 3) readonly buffer bottom_blob3 { sfp bottom_blob3_data[]; }; +layout (binding = 4) readonly buffer bottom_blob4 { sfp bottom_blob4_data[]; }; +layout (binding = 5) readonly buffer bottom_blob5 { sfp bottom_blob5_data[]; }; +layout (binding = 6) readonly buffer bottom_blob6 { sfp bottom_blob6_data[]; }; +layout (binding = 7) readonly buffer bottom_blob7 { sfp bottom_blob7_data[]; }; +#if NCNN_int8_storage +layout (binding = 8) writeonly buffer top_blob { uint8_t top_blob_data[]; }; +#else +layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; }; +#endif + +layout (push_constant) uniform 
parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= 3) + return; + + int gzi = gz * p.cstep; + + float v0 = float(bottom_blob0_data[gzi + gy * p.w + gx]); + float v1 = float(bottom_blob1_data[gzi + gy * p.w + (p.w - 1 - gx)]); + float v2 = float(bottom_blob2_data[gzi + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]); + float v3 = float(bottom_blob3_data[gzi + (p.h - 1 - gy) * p.w + gx]); + float v4 = float(bottom_blob4_data[gzi + gx * p.h + gy]); + float v5 = float(bottom_blob5_data[gzi + gx * p.h + (p.h - 1 - gy)]); + float v6 = float(bottom_blob6_data[gzi + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]); + float v7 = float(bottom_blob7_data[gzi + (p.w - 1 - gx) * p.h + gy]); + + float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f; + + const float denorm_val = 255.f; + const float clip_eps = 0.5f; + + v = v * denorm_val + clip_eps; + +#if NCNN_int8_storage + int v_offset = gy * p.outw + gx; + + uint v32 = clamp(uint(floor(v)), 0, 255); + + if (bgr == 0) + top_blob_data[v_offset * 3 + gz] = uint8_t(v32); + else + top_blob_data[v_offset * 3 + 2 - gz] = uint8_t(v32); +#else + int v_offset = gz * p.outcstep + gy * p.outw + gx; + + top_blob_data[v_offset] = v; +#endif +} diff --git a/src/rife_preproc_tta.comp b/src/rife_preproc_tta.comp new file mode 100644 index 0000000..9d229f6 --- /dev/null +++ b/src/rife_preproc_tta.comp @@ -0,0 +1,93 @@ +// rife implemented with ncnn library + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif + +#if NCNN_int8_storage +#extension GL_EXT_shader_8bit_storage: require +#endif + +layout (constant_id = 0) const int bgr = 0; + +#if NCNN_int8_storage +layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer 
bottom_blob { float bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob0 { sfp top_blob0_data[]; }; +layout (binding = 2) writeonly buffer top_blob1 { sfp top_blob1_data[]; }; +layout (binding = 3) writeonly buffer top_blob2 { sfp top_blob2_data[]; }; +layout (binding = 4) writeonly buffer top_blob3 { sfp top_blob3_data[]; }; +layout (binding = 5) writeonly buffer top_blob4 { sfp top_blob4_data[]; }; +layout (binding = 6) writeonly buffer top_blob5 { sfp top_blob5_data[]; }; +layout (binding = 7) writeonly buffer top_blob6 { sfp top_blob6_data[]; }; +layout (binding = 8) writeonly buffer top_blob7 { sfp top_blob7_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int cstep; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= 3) + return; + + if (gx < 0 || gx >= p.w || gy < 0 || gy >= p.h) + { + int gzi = gz * p.outcstep; + + top_blob0_data[gzi + gy * p.outw + gx] = sfp(0.f); + top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(0.f); + top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(0.f); + top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(0.f); + top_blob4_data[gzi + gx * p.outh + gy] = sfp(0.f); + top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(0.f); + top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = sfp(0.f); + top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(0.f); + return; + } + +#if NCNN_int8_storage + int v_offset = gy * p.w + gx; + + float v; + + if (bgr == 0) + v = float(uint(bottom_blob_data[v_offset * 3 + gz])); + else + v = float(uint(bottom_blob_data[v_offset * 3 + 2 - gz])); +#else + int v_offset = gz * p.cstep + gy * p.w + gx; + + float v = bottom_blob_data[v_offset]; +#endif + + const float norm_val = 1 / 255.f; + + v = v * 
norm_val; + + int gzi = gz * p.outcstep; + + top_blob0_data[gzi + gy * p.outw + gx] = sfp(v); + top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(v); + top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(v); + top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(v); + top_blob4_data[gzi + gx * p.outh + gy] = sfp(v); + top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(v); + top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = sfp(v); + top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(v); +}