update ncnn, spatial tta option, enable lto

nihui 2021-01-03 11:20:56 +08:00
parent 5c362017e5
commit 8d0af9f79b
11 changed files with 668 additions and 148 deletions
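
For context on the new -x option: spatial test-time augmentation (TTA-s) runs the network on the 8 dihedral transforms of the input pair (identity, horizontal and vertical flips, 180-degree rotation, and the four transposed variants) and averages the results after mapping them back to the canonical orientation, trading roughly 8x compute for a somewhat more stable result. The sketch below is a minimal CPU illustration of that averaging on a single-channel image; infer() is a hypothetical stand-in for the RIFE networks, not the project's API, and the commit itself does all of this on the GPU through the new *_tta.comp shaders, additionally averaging the optical flow across variants.

```cpp
#include <cstdio>
#include <vector>

struct Image { int w, h; std::vector<float> d; };

// linear index of canonical pixel (x, y) inside dihedral variant k (0..7);
// variants 4..7 are transposed, so their row stride is h instead of w
// (same layout as the new rife_preproc_tta.comp / rife_postproc_tta.comp)
static int variant_index(int k, int x, int y, int w, int h)
{
    switch (k)
    {
    case 0: return y * w + x;                     // identity
    case 1: return y * w + (w - 1 - x);           // horizontal flip
    case 2: return (h - 1 - y) * w + (w - 1 - x); // 180 degree rotation
    case 3: return (h - 1 - y) * w + x;           // vertical flip
    case 4: return x * h + y;                     // transpose
    case 5: return x * h + (h - 1 - y);           // transposed + flip
    case 6: return (w - 1 - x) * h + (h - 1 - y); // anti-transpose
    default: return (w - 1 - x) * h + y;          // transposed + flip
    }
}

// hypothetical model: here just a per-pixel blend of the two input frames
static Image infer(const Image& a, const Image& b)
{
    Image o = a;
    for (size_t i = 0; i < o.d.size(); i++)
        o.d[i] = 0.5f * (a.d[i] + b.d[i]);
    return o;
}

static Image interpolate_tta(const Image& in0, const Image& in1)
{
    const int w = in0.w, h = in0.h;
    Image avg = { w, h, std::vector<float>(w * h, 0.f) };
    for (int k = 0; k < 8; k++)
    {
        // build the k-th augmented input pair
        Image a = { k < 4 ? w : h, k < 4 ? h : w, std::vector<float>(w * h) };
        Image b = a;
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
            {
                int i = variant_index(k, x, y, w, h);
                a.d[i] = in0.d[y * w + x];
                b.d[i] = in1.d[y * w + x];
            }

        // run the model on the augmented pair, then accumulate its output
        // back in the canonical orientation (1/8 weight per variant)
        Image out = infer(a, b);
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
                avg.d[y * w + x] += out.d[variant_index(k, x, y, w, h)] * 0.125f;
    }
    return avg;
}

int main()
{
    Image in0 = { 4, 3, std::vector<float>(4 * 3, 0.2f) };
    Image in1 = { 4, 3, std::vector<float>(4 * 3, 0.8f) };
    Image mid = interpolate_tta(in0, in1);
    printf("%.3f\n", mid.d[0]); // 0.500 with this toy model
    return 0;
}
```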

README.md

@@ -74,6 +74,7 @@ Usage: rife-ncnn-vulkan -0 infile -1 infile1 -o outfile [options]...
-m model-path rife model path (default=rife-HD)
-g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu
-j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu
-x enable tta mode
-f pattern-format output image filename pattern format (%08d.jpg/png/webp, default=ext/%08d.png)
```
@@ -122,7 +123,6 @@ cmake --build . -j 4
### TODO
* test-time spatial augmentation aka TTA-s
* test-time temporal augmentation aka TTA-t
### Model
@@ -141,13 +141,21 @@ cmake --build . -j 4
![origin0](images/0.png)
![origin1](images/1.png)
### Interpolate with rife rife-HD model
### Interpolate with rife rife-anime model
```shell
rife-ncnn-vulkan.exe -m models/rife-HD -0 0.png -1 1.png -o out.png
rife-ncnn-vulkan.exe -m models/rife-anime -0 0.png -1 1.png -o out.png
```
![cain](images/out.png)
![rife](images/out.png)
### Interpolate with rife rife-anime model + TTA-s
```shell
rife-ncnn-vulkan.exe -m models/rife-anime -x -0 0.png -1 1.png -o out.png
```
![rife](images/outx.png)
## Original RIFE Project

images/out.png (binary file changed, 232 KiB -> 309 KiB)

images/outx.png (new binary file, 250 KiB)

src/CMakeLists.txt

@@ -46,7 +46,7 @@ set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
include(CheckIPOSupported)
check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
if(ipo_supported)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else()
message(WARNING "IPO is not supported: ${ipo_supported_output}")
endif()
@@ -221,6 +221,9 @@ endif()
rife_add_shader(rife_preproc.comp)
rife_add_shader(rife_postproc.comp)
rife_add_shader(rife_preproc_tta.comp)
rife_add_shader(rife_postproc_tta.comp)
rife_add_shader(rife_flow_tta_avg.comp)
rife_add_shader(warp.comp)
rife_add_shader(warp_pack4.comp)
rife_add_shader(warp_pack8.comp)

src/main.cpp

@@ -112,6 +112,7 @@ static void print_usage()
fprintf(stderr, " -m model-path rife model path (default=rife-HD)\n");
fprintf(stderr, " -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu\n");
fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n");
fprintf(stderr, " -x enable tta mode\n");
fprintf(stderr, " -f pattern-format output image filename pattern format (%%08d.jpg/png/webp, default=ext/%%08d.png)\n");
}
@@ -447,12 +448,13 @@ int main(int argc, char** argv)
std::vector<int> jobs_proc;
int jobs_save = 2;
int verbose = 0;
int tta_mode = 0;
path_t pattern_format = PATHSTR("%08d.png");
#if _WIN32
setlocale(LC_ALL, "");
wchar_t opt;
while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vh")) != (wchar_t)-1)
while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vxh")) != (wchar_t)-1)
{
switch (opt)
{
@@ -484,6 +486,9 @@ int main(int argc, char** argv)
case L'v':
verbose = 1;
break;
case L'x':
tta_mode = 1;
break;
case L'h':
default:
print_usage();
@@ -492,7 +497,7 @@ int main(int argc, char** argv)
}
#else // _WIN32
int opt;
while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vh")) != -1)
while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vxh")) != -1)
{
switch (opt)
{
@@ -524,6 +529,9 @@ int main(int argc, char** argv)
case 'v':
verbose = 1;
break;
case 'x':
tta_mode = 1;
break;
case 'h':
default:
print_usage();
@@ -728,7 +736,7 @@ int main(int argc, char** argv)
int total_jobs_proc = 0;
for (int i=0; i<use_gpu_count; i++)
{
int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count;
int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count();
jobs_proc[i] = std::min(jobs_proc[i], gpu_queue_count);
total_jobs_proc += jobs_proc[i];
}
@@ -738,7 +746,7 @@ int main(int argc, char** argv)
for (int i=0; i<use_gpu_count; i++)
{
rife[i] = new RIFE(gpuid[i]);
rife[i] = new RIFE(gpuid[i], tta_mode);
rife[i]->load(modeldir);
}

src/ncnn (submodule)

@@ -1 +1 @@
Subproject commit 124d2c3d854cabe8c39dc13993b36dc4efd13713
Subproject commit 1a81be6259c032c42271b7d666cb4a2494e54a50

src/rife.cpp

@@ -8,16 +8,21 @@
#include "rife_preproc.comp.hex.h"
#include "rife_postproc.comp.hex.h"
#include "rife_preproc_tta.comp.hex.h"
#include "rife_postproc_tta.comp.hex.h"
#include "rife_flow_tta_avg.comp.hex.h"
#include "rife_ops.h"
DEFINE_LAYER_CREATOR(Warp)
RIFE::RIFE(int gpuid)
RIFE::RIFE(int gpuid, bool _tta_mode)
{
vkdev = ncnn::get_gpu_device(gpuid);
rife_preproc = 0;
rife_postproc = 0;
rife_flow_tta_avg = 0;
tta_mode = _tta_mode;
}
RIFE::~RIFE()
@@ -26,6 +31,7 @@ RIFE::~RIFE()
{
delete rife_preproc;
delete rife_postproc;
delete rife_flow_tta_avg;
}
}
@@ -124,7 +130,10 @@ int RIFE::load(const std::string& modeldir)
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
if (tta_mode)
compile_spirv_module(rife_preproc_tta_comp_data, sizeof(rife_preproc_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
}
}
@@ -140,7 +149,10 @@ int RIFE::load(const std::string& modeldir)
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
if (tta_mode)
compile_spirv_module(rife_postproc_tta_comp_data, sizeof(rife_postproc_tta_comp_data), opt, spirv);
else
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
}
}
@@ -150,6 +162,25 @@ int RIFE::load(const std::string& modeldir)
}
}
if (tta_mode)
{
static std::vector<uint32_t> spirv;
static ncnn::Mutex lock;
{
ncnn::MutexLockGuard guard(lock);
if (spirv.empty())
{
compile_spirv_module(rife_flow_tta_avg_comp_data, sizeof(rife_flow_tta_avg_comp_data), opt, spirv);
}
}
std::vector<ncnn::vk_specialization_type> specializations(0);
rife_flow_tta_avg = new ncnn::Pipeline(vkdev);
rife_flow_tta_avg->set_optimal_local_size_xyz(8, 8, 1);
rife_flow_tta_avg->create(spirv.data(), spirv.size() * 4, specializations);
}
return 0;
}
@@ -217,148 +248,370 @@ int RIFE::process(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float ti
cmd.record_clone(in1, in1_gpu, opt);
}
// preproc
ncnn::VkMat in0_gpu_padded;
ncnn::VkMat in1_gpu_padded;
{
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded.w;
constants[4].i = in0_gpu_padded.h;
constants[5].i = in0_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
}
{
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded.w;
constants[4].i = in1_gpu_padded.h;
constants[5].i = in1_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
}
// flownet
ncnn::VkMat flow;
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input0", in0_gpu_padded);
ex.input("input1", in1_gpu_padded);
ex.extract("flow", flow, cmd);
}
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded);
ex.input("flow.0", flow);
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded);
ex.input("flow.1", flow);
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
ncnn::VkMat out_gpu_padded;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded);
ex.input("img1", in1_gpu_padded);
ex.input("flow", flow);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
in0_gpu.release();
in1_gpu.release();
flow.release();
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded, cmd);
}
ncnn::VkMat out_gpu;
if (opt.use_fp16_storage && opt.use_int8_storage)
if (tta_mode)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
// preproc
ncnn::VkMat in0_gpu_padded[8];
ncnn::VkMat in1_gpu_padded[8];
{
in0_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in0_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded[0];
bindings[2] = in0_gpu_padded[1];
bindings[3] = in0_gpu_padded[2];
bindings[4] = in0_gpu_padded[3];
bindings[5] = in0_gpu_padded[4];
bindings[6] = in0_gpu_padded[5];
bindings[7] = in0_gpu_padded[6];
bindings[8] = in0_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded[0].w;
constants[4].i = in0_gpu_padded[0].h;
constants[5].i = in0_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded[0]);
}
{
in1_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
in1_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded[0];
bindings[2] = in1_gpu_padded[1];
bindings[3] = in1_gpu_padded[2];
bindings[4] = in1_gpu_padded[3];
bindings[5] = in1_gpu_padded[4];
bindings[6] = in1_gpu_padded[5];
bindings[7] = in1_gpu_padded[6];
bindings[8] = in1_gpu_padded[7];
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded[0].w;
constants[4].i = in1_gpu_padded[0].h;
constants[5].i = in1_gpu_padded[0].cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded[0]);
}
ncnn::VkMat flow[8];
for (int ti = 0; ti < 8; ti++)
{
// flownet
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input0", in0_gpu_padded[ti]);
ex.input("input1", in1_gpu_padded[ti]);
ex.extract("flow", flow[ti], cmd);
}
}
// avg flow
{
std::vector<ncnn::VkMat> bindings(8);
bindings[0] = flow[0];
bindings[1] = flow[1];
bindings[2] = flow[2];
bindings[3] = flow[3];
bindings[4] = flow[4];
bindings[5] = flow[5];
bindings[6] = flow[6];
bindings[7] = flow[7];
std::vector<ncnn::vk_constant_type> constants(3);
constants[0].i = flow[0].w;
constants[1].i = flow[0].h;
constants[2].i = flow[0].cstep;
ncnn::VkMat dispatcher;
dispatcher.w = flow[0].w;
dispatcher.h = flow[0].h;
dispatcher.c = 1;
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
}
ncnn::VkMat out_gpu_padded[8];
for (int ti = 0; ti < 8; ti++)
{
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded[ti]);
ex.input("flow.0", flow[ti]);
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded[ti]);
ex.input("flow.1", flow[ti]);
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded[ti]);
ex.input("img1", in1_gpu_padded[ti]);
ex.input("flow", flow[ti]);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
if (ti == 0)
{
in0_gpu.release();
in1_gpu.release();
}
else
{
in0_gpu_padded[ti - 1].release();
in1_gpu_padded[ti - 1].release();
flow[ti - 1].release();
}
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded[ti], cmd);
}
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(9);
bindings[0] = out_gpu_padded[0];
bindings[1] = out_gpu_padded[1];
bindings[2] = out_gpu_padded[2];
bindings[3] = out_gpu_padded[3];
bindings[4] = out_gpu_padded[4];
bindings[5] = out_gpu_padded[5];
bindings[6] = out_gpu_padded[6];
bindings[7] = out_gpu_padded[7];
bindings[8] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded[0].w;
constants[1].i = out_gpu_padded[0].h;
constants[2].i = out_gpu_padded[0].cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// preproc
ncnn::VkMat in0_gpu_padded;
ncnn::VkMat in1_gpu_padded;
{
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
// postproc
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu;
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in0_gpu;
bindings[1] = in0_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in0_gpu.w;
constants[1].i = in0_gpu.h;
constants[2].i = in0_gpu.cstep;
constants[3].i = in0_gpu_padded.w;
constants[4].i = in0_gpu_padded.h;
constants[5].i = in0_gpu_padded.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
}
{
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = in1_gpu;
bindings[1] = in1_gpu_padded;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = in1_gpu.w;
constants[1].i = in1_gpu.h;
constants[2].i = in1_gpu.cstep;
constants[3].i = in1_gpu_padded.w;
constants[4].i = in1_gpu_padded.h;
constants[5].i = in1_gpu_padded.cstep;
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
}
// flownet
ncnn::VkMat flow;
{
ncnn::Extractor ex = flownet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input0", in0_gpu_padded);
ex.input("input1", in1_gpu_padded);
ex.extract("flow", flow, cmd);
}
// contextnet
ncnn::VkMat ctx0[4];
ncnn::VkMat ctx1[4];
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in0_gpu_padded);
ex.input("flow.0", flow);
ex.extract("f1", ctx0[0], cmd);
ex.extract("f2", ctx0[1], cmd);
ex.extract("f3", ctx0[2], cmd);
ex.extract("f4", ctx0[3], cmd);
}
{
ncnn::Extractor ex = contextnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("input.1", in1_gpu_padded);
ex.input("flow.1", flow);
ex.extract("f1", ctx1[0], cmd);
ex.extract("f2", ctx1[1], cmd);
ex.extract("f3", ctx1[2], cmd);
ex.extract("f4", ctx1[3], cmd);
}
// fusionnet
ncnn::VkMat out_gpu_padded;
{
ncnn::Extractor ex = fusionnet.create_extractor();
ex.set_blob_vkallocator(blob_vkallocator);
ex.set_workspace_vkallocator(blob_vkallocator);
ex.set_staging_vkallocator(staging_vkallocator);
ex.input("img0", in0_gpu_padded);
ex.input("img1", in1_gpu_padded);
ex.input("flow", flow);
ex.input("3", ctx0[0]);
ex.input("4", ctx0[1]);
ex.input("5", ctx0[2]);
ex.input("6", ctx0[3]);
ex.input("7", ctx1[0]);
ex.input("8", ctx1[1]);
ex.input("9", ctx1[2]);
ex.input("10", ctx1[3]);
// save some memory
in0_gpu.release();
in1_gpu.release();
flow.release();
ctx0[0].release();
ctx0[1].release();
ctx0[2].release();
ctx0[3].release();
ctx1[0].release();
ctx1[1].release();
ctx1[2].release();
ctx1[3].release();
ex.extract("output", out_gpu_padded, cmd);
}
if (opt.use_fp16_storage && opt.use_int8_storage)
{
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
}
else
{
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
}
// postproc
{
std::vector<ncnn::VkMat> bindings(2);
bindings[0] = out_gpu_padded;
bindings[1] = out_gpu;
std::vector<ncnn::vk_constant_type> constants(6);
constants[0].i = out_gpu_padded.w;
constants[1].i = out_gpu_padded.h;
constants[2].i = out_gpu_padded.cstep;
constants[3].i = out_gpu.w;
constants[4].i = out_gpu.h;
constants[5].i = out_gpu.cstep;
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
}
}
// download

src/rife.h

@@ -11,7 +11,7 @@
class RIFE
{
public:
RIFE(int gpuid);
RIFE(int gpuid, bool tta_mode = false);
~RIFE();
#if _WIN32
@@ -29,6 +29,8 @@ private:
ncnn::Net fusionnet;
ncnn::Pipeline* rife_preproc;
ncnn::Pipeline* rife_postproc;
ncnn::Pipeline* rife_flow_tta_avg;
bool tta_mode;
};
#endif // RIFE_H

src/rife_flow_tta_avg.comp (new file)

@@ -0,0 +1,72 @@
// rife implemented with ncnn library
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
layout (binding = 0) buffer flow_blob0 { sfp flow_blob0_data[]; };
layout (binding = 1) buffer flow_blob1 { sfp flow_blob1_data[]; };
layout (binding = 2) buffer flow_blob2 { sfp flow_blob2_data[]; };
layout (binding = 3) buffer flow_blob3 { sfp flow_blob3_data[]; };
layout (binding = 4) buffer flow_blob4 { sfp flow_blob4_data[]; };
layout (binding = 5) buffer flow_blob5 { sfp flow_blob5_data[]; };
layout (binding = 6) buffer flow_blob6 { sfp flow_blob6_data[]; };
layout (binding = 7) buffer flow_blob7 { sfp flow_blob7_data[]; };
layout (push_constant) uniform parameter
{
int w;
int h;
int cstep;
} p;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= p.w || gy >= p.h || gz >= 1)
return;
float x0 = float(flow_blob0_data[gy * p.w + gx]);
float x1 = float(flow_blob1_data[gy * p.w + (p.w - 1 - gx)]);
float x2 = float(flow_blob2_data[(p.h - 1 - gy) * p.w + (p.w - 1 - gx)]);
float x3 = float(flow_blob3_data[(p.h - 1 - gy) * p.w + gx]);
float x4 = float(flow_blob4_data[gx * p.h + gy]);
float x5 = float(flow_blob5_data[gx * p.h + (p.h - 1 - gy)]);
float x6 = float(flow_blob6_data[(p.w - 1 - gx) * p.h + (p.h - 1 - gy)]);
float x7 = float(flow_blob7_data[(p.w - 1 - gx) * p.h + gy]);
float y0 = float(flow_blob0_data[p.cstep + gy * p.w + gx]);
float y1 = float(flow_blob1_data[p.cstep + gy * p.w + (p.w - 1 - gx)]);
float y2 = float(flow_blob2_data[p.cstep + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]);
float y3 = float(flow_blob3_data[p.cstep + (p.h - 1 - gy) * p.w + gx]);
float y4 = float(flow_blob4_data[p.cstep + gx * p.h + gy]);
float y5 = float(flow_blob5_data[p.cstep + gx * p.h + (p.h - 1 - gy)]);
float y6 = float(flow_blob6_data[p.cstep + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]);
float y7 = float(flow_blob7_data[p.cstep + (p.w - 1 - gx) * p.h + gy]);
float x = (x0 + -x1 + -x2 + x3 + y4 + y5 + -y6 + -y7) * 0.125f;
float y = (y0 + y1 + -y2 + -y3 + x4 + -x5 + -x6 + x7) * 0.125f;
flow_blob0_data[gy * p.w + gx] = sfp(x);
flow_blob1_data[gy * p.w + (p.w - 1 - gx)] = sfp(-x);
flow_blob2_data[(p.h - 1 - gy) * p.w + (p.w - 1 - gx)] = sfp(-x);
flow_blob3_data[(p.h - 1 - gy) * p.w + gx] = sfp(x);
flow_blob4_data[gx * p.h + gy] = sfp(y);
flow_blob5_data[gx * p.h + (p.h - 1 - gy)] = sfp(-y);
flow_blob6_data[(p.w - 1 - gx) * p.h + (p.h - 1 - gy)] = sfp(-y);
flow_blob7_data[(p.w - 1 - gx) * p.h + gy] = sfp(y);
flow_blob0_data[p.cstep + gy * p.w + gx] = sfp(y);
flow_blob1_data[p.cstep + gy * p.w + (p.w - 1 - gx)] = sfp(y);
flow_blob2_data[p.cstep + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)] = sfp(-y);
flow_blob3_data[p.cstep + (p.h - 1 - gy) * p.w + gx] = sfp(-y);
flow_blob4_data[p.cstep + gx * p.h + gy] = sfp(x);
flow_blob5_data[p.cstep + gx * p.h + (p.h - 1 - gy)] = sfp(x);
flow_blob6_data[p.cstep + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)] = sfp(-x);
flow_blob7_data[p.cstep + (p.w - 1 - gx) * p.h + gy] = sfp(-x);
}
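
The sign flips and component swaps above follow from how a 2-D flow vector transforms under each dihedral variant (0-3 are the flips, 4-7 the transposed variants, in the layout produced by rife_preproc_tta.comp further down). The snippet below is a small CPU reference for the mapping the shader applies before averaging; it is a reading aid, not a drop-in replacement. After averaging, the shader writes the result back into all eight flow blobs with the corresponding forward mapping.

```cpp
#include <cstdio>
#include <utility>

// Map a flow vector measured in dihedral variant k back to the canonical
// orientation. fu/fv are the x/y channels read from the k-th flow blob;
// the returned (dx, dy) is what gets accumulated into the 1/8 average.
static std::pair<float, float> flow_to_canonical(int k, float fu, float fv)
{
    switch (k)
    {
    case 0: return {  fu,  fv }; // identity
    case 1: return { -fu,  fv }; // horizontal flip negates x
    case 2: return { -fu, -fv }; // 180 degree rotation negates both
    case 3: return {  fu, -fv }; // vertical flip negates y
    case 4: return {  fv,  fu }; // transpose swaps the components
    case 5: return {  fv, -fu }; // 90 degree rotation: swap, negate one
    case 6: return { -fv, -fu }; // anti-transpose: swap, negate both
    default: return { -fv,  fu }; // 90 degree rotation the other way
    }
}

int main()
{
    // a flow of (+3, +1) measured in the horizontally flipped variant is
    // (-3, +1) in the canonical frame, matching the -x1 / +y1 terms above
    std::pair<float, float> d = flow_to_canonical(1, 3.f, 1.f);
    printf("%.0f %.0f\n", d.first, d.second); // -3 1
    return 0;
}
```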

src/rife_postproc_tta.comp (new file)

@@ -0,0 +1,81 @@
// rife implemented with ncnn library
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_int8_storage
#extension GL_EXT_shader_8bit_storage: require
#endif
layout (constant_id = 0) const int bgr = 0;
layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; };
layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; };
layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; };
layout (binding = 3) readonly buffer bottom_blob3 { sfp bottom_blob3_data[]; };
layout (binding = 4) readonly buffer bottom_blob4 { sfp bottom_blob4_data[]; };
layout (binding = 5) readonly buffer bottom_blob5 { sfp bottom_blob5_data[]; };
layout (binding = 6) readonly buffer bottom_blob6 { sfp bottom_blob6_data[]; };
layout (binding = 7) readonly buffer bottom_blob7 { sfp bottom_blob7_data[]; };
#if NCNN_int8_storage
layout (binding = 8) writeonly buffer top_blob { uint8_t top_blob_data[]; };
#else
layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; };
#endif
layout (push_constant) uniform parameter
{
int w;
int h;
int cstep;
int outw;
int outh;
int outcstep;
} p;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= p.outw || gy >= p.outh || gz >= 3)
return;
int gzi = gz * p.cstep;
float v0 = float(bottom_blob0_data[gzi + gy * p.w + gx]);
float v1 = float(bottom_blob1_data[gzi + gy * p.w + (p.w - 1 - gx)]);
float v2 = float(bottom_blob2_data[gzi + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]);
float v3 = float(bottom_blob3_data[gzi + (p.h - 1 - gy) * p.w + gx]);
float v4 = float(bottom_blob4_data[gzi + gx * p.h + gy]);
float v5 = float(bottom_blob5_data[gzi + gx * p.h + (p.h - 1 - gy)]);
float v6 = float(bottom_blob6_data[gzi + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]);
float v7 = float(bottom_blob7_data[gzi + (p.w - 1 - gx) * p.h + gy]);
float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f;
const float denorm_val = 255.f;
const float clip_eps = 0.5f;
v = v * denorm_val + clip_eps;
#if NCNN_int8_storage
int v_offset = gy * p.outw + gx;
uint v32 = clamp(uint(floor(v)), 0, 255);
if (bgr == 0)
top_blob_data[v_offset * 3 + gz] = uint8_t(v32);
else
top_blob_data[v_offset * 3 + 2 - gz] = uint8_t(v32);
#else
int v_offset = gz * p.outcstep + gy * p.outw + gx;
top_blob_data[v_offset] = v;
#endif
}
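
For reference, the int8 output path above averages the eight variant outputs, denormalizes by 255 with a +0.5 rounding offset (the denorm_val and clip_eps constants), then floors and clamps to [0, 255]. A tiny worked example of that arithmetic, as a standalone sketch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    float v_variant[8] = { 0.500f, 0.502f, 0.501f, 0.499f,
                           0.503f, 0.498f, 0.500f, 0.501f };
    float v = 0.f;
    for (int i = 0; i < 8; i++)
        v += v_variant[i];
    v = v * 0.125f;            // average of the 8 dihedral variants
    v = v * 255.f + 0.5f;      // denorm_val and clip_eps in the shader
    int v32 = std::min(std::max((int)std::floor(v), 0), 255);
    printf("%d\n", v32);       // prints 128
    return 0;
}
```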

src/rife_preproc_tta.comp (new file)

@@ -0,0 +1,93 @@
// rife implemented with ncnn library
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_int8_storage
#extension GL_EXT_shader_8bit_storage: require
#endif
layout (constant_id = 0) const int bgr = 0;
#if NCNN_int8_storage
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob0 { sfp top_blob0_data[]; };
layout (binding = 2) writeonly buffer top_blob1 { sfp top_blob1_data[]; };
layout (binding = 3) writeonly buffer top_blob2 { sfp top_blob2_data[]; };
layout (binding = 4) writeonly buffer top_blob3 { sfp top_blob3_data[]; };
layout (binding = 5) writeonly buffer top_blob4 { sfp top_blob4_data[]; };
layout (binding = 6) writeonly buffer top_blob5 { sfp top_blob5_data[]; };
layout (binding = 7) writeonly buffer top_blob6 { sfp top_blob6_data[]; };
layout (binding = 8) writeonly buffer top_blob7 { sfp top_blob7_data[]; };
layout (push_constant) uniform parameter
{
int w;
int h;
int cstep;
int outw;
int outh;
int outcstep;
} p;
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= p.outw || gy >= p.outh || gz >= 3)
return;
if (gx < 0 || gx >= p.w || gy < 0 || gy >= p.h)
{
int gzi = gz * p.outcstep;
top_blob0_data[gzi + gy * p.outw + gx] = sfp(0.f);
top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(0.f);
top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(0.f);
top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(0.f);
top_blob4_data[gzi + gx * p.outh + gy] = sfp(0.f);
top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(0.f);
top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = sfp(0.f);
top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(0.f);
return;
}
#if NCNN_int8_storage
int v_offset = gy * p.w + gx;
float v;
if (bgr == 0)
v = float(uint(bottom_blob_data[v_offset * 3 + gz]));
else
v = float(uint(bottom_blob_data[v_offset * 3 + 2 - gz]));
#else
int v_offset = gz * p.cstep + gy * p.w + gx;
float v = bottom_blob_data[v_offset];
#endif
const float norm_val = 1 / 255.f;
v = v * norm_val;
int gzi = gz * p.outcstep;
top_blob0_data[gzi + gy * p.outw + gx] = sfp(v);
top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = sfp(v);
top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = sfp(v);
top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = sfp(v);
top_blob4_data[gzi + gx * p.outh + gy] = sfp(v);
top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = sfp(v);
top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = sfp(v);
top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = sfp(v);
}