update ncnn, spatial tta option, enable lto
This commit is contained in:
parent
5c362017e5
commit
8d0af9f79b
16
README.md
16
README.md
|
@ -74,6 +74,7 @@ Usage: rife-ncnn-vulkan -0 infile -1 infile1 -o outfile [options]...
|
|||
-m model-path rife model path (default=rife-HD)
|
||||
-g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu
|
||||
-j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu
|
||||
-x enable tta mode
|
||||
-f pattern-format output image filename pattern format (%08d.jpg/png/webp, default=ext/%08d.png)
|
||||
```
|
||||
|
||||
|
@ -122,7 +123,6 @@ cmake --build . -j 4
|
|||
|
||||
### TODO
|
||||
|
||||
* test-time spatial augmentation aka TTA-s
|
||||
* test-time temporal augmentation aka TTA-t
|
||||
|
||||
### Model
|
||||
|
@ -141,13 +141,21 @@ cmake --build . -j 4
|
|||
![origin0](images/0.png)
|
||||
![origin1](images/1.png)
|
||||
|
||||
### Interpolate with rife rife-HD model
|
||||
### Interpolate with rife rife-anime model
|
||||
|
||||
```shell
|
||||
rife-ncnn-vulkan.exe -m models/rife-HD -0 0.png -1 1.png -o out.png
|
||||
rife-ncnn-vulkan.exe -m models/rife-anime -0 0.png -1 1.png -o out.png
|
||||
```
|
||||
|
||||
![cain](images/out.png)
|
||||
![rife](images/out.png)
|
||||
|
||||
### Interpolate with rife rife-anime model + TTA-s
|
||||
|
||||
```shell
|
||||
rife-ncnn-vulkan.exe -m models/rife-anime -x -0 0.png -1 1.png -o out.png
|
||||
```
|
||||
|
||||
![rife](images/outx.png)
|
||||
|
||||
## Original RIFE Project
|
||||
|
||||
|
|
BIN
images/out.png
BIN
images/out.png
Binary file not shown.
Before Width: | Height: | Size: 232 KiB After Width: | Height: | Size: 309 KiB |
Binary file not shown.
After Width: | Height: | Size: 250 KiB |
|
@ -46,7 +46,7 @@ set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
|
|||
include(CheckIPOSupported)
|
||||
check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
|
||||
if(ipo_supported)
|
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)
|
||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
|
||||
else()
|
||||
message(WARNING "IPO is not supported: ${ipo_supported_output}")
|
||||
endif()
|
||||
|
@ -221,6 +221,9 @@ endif()
|
|||
|
||||
rife_add_shader(rife_preproc.comp)
|
||||
rife_add_shader(rife_postproc.comp)
|
||||
rife_add_shader(rife_preproc_tta.comp)
|
||||
rife_add_shader(rife_postproc_tta.comp)
|
||||
rife_add_shader(rife_flow_tta_avg.comp)
|
||||
rife_add_shader(warp.comp)
|
||||
rife_add_shader(warp_pack4.comp)
|
||||
rife_add_shader(warp_pack8.comp)
|
||||
|
|
16
src/main.cpp
16
src/main.cpp
|
@ -112,6 +112,7 @@ static void print_usage()
|
|||
fprintf(stderr, " -m model-path rife model path (default=rife-HD)\n");
|
||||
fprintf(stderr, " -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu\n");
|
||||
fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n");
|
||||
// Usage text belongs on stderr, same stream as the neighbouring option lines.
fprintf(stderr, " -x enable tta mode\n");
|
||||
fprintf(stderr, " -f pattern-format output image filename pattern format (%%08d.jpg/png/webp, default=ext/%%08d.png)\n");
|
||||
}
|
||||
|
||||
|
@ -447,12 +448,13 @@ int main(int argc, char** argv)
|
|||
std::vector<int> jobs_proc;
|
||||
int jobs_save = 2;
|
||||
int verbose = 0;
|
||||
int tta_mode = 0;
|
||||
path_t pattern_format = PATHSTR("%08d.png");
|
||||
|
||||
#if _WIN32
|
||||
setlocale(LC_ALL, "");
|
||||
wchar_t opt;
|
||||
while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vh")) != (wchar_t)-1)
|
||||
while ((opt = getopt(argc, argv, L"0:1:i:o:m:g:j:f:vxh")) != (wchar_t)-1)
|
||||
{
|
||||
switch (opt)
|
||||
{
|
||||
|
@ -484,6 +486,9 @@ int main(int argc, char** argv)
|
|||
case L'v':
|
||||
verbose = 1;
|
||||
break;
|
||||
case L'x':
|
||||
tta_mode = 1;
|
||||
break;
|
||||
case L'h':
|
||||
default:
|
||||
print_usage();
|
||||
|
@ -492,7 +497,7 @@ int main(int argc, char** argv)
|
|||
}
|
||||
#else // _WIN32
|
||||
int opt;
|
||||
while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vh")) != -1)
|
||||
while ((opt = getopt(argc, argv, "0:1:i:o:m:g:j:f:vxh")) != -1)
|
||||
{
|
||||
switch (opt)
|
||||
{
|
||||
|
@ -524,6 +529,9 @@ int main(int argc, char** argv)
|
|||
case 'v':
|
||||
verbose = 1;
|
||||
break;
|
||||
case 'x':
|
||||
tta_mode = 1;
|
||||
break;
|
||||
case 'h':
|
||||
default:
|
||||
print_usage();
|
||||
|
@ -728,7 +736,7 @@ int main(int argc, char** argv)
|
|||
int total_jobs_proc = 0;
|
||||
for (int i=0; i<use_gpu_count; i++)
|
||||
{
|
||||
int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count;
|
||||
int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count();
|
||||
jobs_proc[i] = std::min(jobs_proc[i], gpu_queue_count);
|
||||
total_jobs_proc += jobs_proc[i];
|
||||
}
|
||||
|
@ -738,7 +746,7 @@ int main(int argc, char** argv)
|
|||
|
||||
for (int i=0; i<use_gpu_count; i++)
|
||||
{
|
||||
rife[i] = new RIFE(gpuid[i]);
|
||||
rife[i] = new RIFE(gpuid[i], tta_mode);
|
||||
|
||||
rife[i]->load(modeldir);
|
||||
}
|
||||
|
|
2
src/ncnn
2
src/ncnn
|
@ -1 +1 @@
|
|||
Subproject commit 124d2c3d854cabe8c39dc13993b36dc4efd13713
|
||||
Subproject commit 1a81be6259c032c42271b7d666cb4a2494e54a50
|
527
src/rife.cpp
527
src/rife.cpp
|
@ -8,16 +8,21 @@
|
|||
|
||||
#include "rife_preproc.comp.hex.h"
|
||||
#include "rife_postproc.comp.hex.h"
|
||||
#include "rife_preproc_tta.comp.hex.h"
|
||||
#include "rife_postproc_tta.comp.hex.h"
|
||||
#include "rife_flow_tta_avg.comp.hex.h"
|
||||
|
||||
#include "rife_ops.h"
|
||||
|
||||
DEFINE_LAYER_CREATOR(Warp)
|
||||
|
||||
RIFE::RIFE(int gpuid)
|
||||
RIFE::RIFE(int gpuid, bool _tta_mode)
|
||||
{
|
||||
vkdev = ncnn::get_gpu_device(gpuid);
|
||||
rife_preproc = 0;
|
||||
rife_postproc = 0;
|
||||
rife_flow_tta_avg = 0;
|
||||
tta_mode = _tta_mode;
|
||||
}
|
||||
|
||||
RIFE::~RIFE()
|
||||
|
@ -26,6 +31,7 @@ RIFE::~RIFE()
|
|||
{
|
||||
delete rife_preproc;
|
||||
delete rife_postproc;
|
||||
delete rife_flow_tta_avg;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -124,7 +130,10 @@ int RIFE::load(const std::string& modeldir)
|
|||
ncnn::MutexLockGuard guard(lock);
|
||||
if (spirv.empty())
|
||||
{
|
||||
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
|
||||
if (tta_mode)
|
||||
compile_spirv_module(rife_preproc_tta_comp_data, sizeof(rife_preproc_tta_comp_data), opt, spirv);
|
||||
else
|
||||
compile_spirv_module(rife_preproc_comp_data, sizeof(rife_preproc_comp_data), opt, spirv);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -140,7 +149,10 @@ int RIFE::load(const std::string& modeldir)
|
|||
ncnn::MutexLockGuard guard(lock);
|
||||
if (spirv.empty())
|
||||
{
|
||||
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
|
||||
if (tta_mode)
|
||||
compile_spirv_module(rife_postproc_tta_comp_data, sizeof(rife_postproc_tta_comp_data), opt, spirv);
|
||||
else
|
||||
compile_spirv_module(rife_postproc_comp_data, sizeof(rife_postproc_comp_data), opt, spirv);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -150,6 +162,25 @@ int RIFE::load(const std::string& modeldir)
|
|||
}
|
||||
}
|
||||
|
||||
if (tta_mode)
|
||||
{
|
||||
static std::vector<uint32_t> spirv;
|
||||
static ncnn::Mutex lock;
|
||||
{
|
||||
ncnn::MutexLockGuard guard(lock);
|
||||
if (spirv.empty())
|
||||
{
|
||||
compile_spirv_module(rife_flow_tta_avg_comp_data, sizeof(rife_flow_tta_avg_comp_data), opt, spirv);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<ncnn::vk_specialization_type> specializations(0);
|
||||
|
||||
rife_flow_tta_avg = new ncnn::Pipeline(vkdev);
|
||||
rife_flow_tta_avg->set_optimal_local_size_xyz(8, 8, 1);
|
||||
rife_flow_tta_avg->create(spirv.data(), spirv.size() * 4, specializations);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -217,148 +248,370 @@ int RIFE::process(const ncnn::Mat& in0image, const ncnn::Mat& in1image, float ti
|
|||
cmd.record_clone(in1, in1_gpu, opt);
|
||||
}
|
||||
|
||||
// preproc
|
||||
ncnn::VkMat in0_gpu_padded;
|
||||
ncnn::VkMat in1_gpu_padded;
|
||||
{
|
||||
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
|
||||
std::vector<ncnn::VkMat> bindings(2);
|
||||
bindings[0] = in0_gpu;
|
||||
bindings[1] = in0_gpu_padded;
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = in0_gpu.w;
|
||||
constants[1].i = in0_gpu.h;
|
||||
constants[2].i = in0_gpu.cstep;
|
||||
constants[3].i = in0_gpu_padded.w;
|
||||
constants[4].i = in0_gpu_padded.h;
|
||||
constants[5].i = in0_gpu_padded.cstep;
|
||||
|
||||
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
|
||||
}
|
||||
{
|
||||
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
|
||||
std::vector<ncnn::VkMat> bindings(2);
|
||||
bindings[0] = in1_gpu;
|
||||
bindings[1] = in1_gpu_padded;
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = in1_gpu.w;
|
||||
constants[1].i = in1_gpu.h;
|
||||
constants[2].i = in1_gpu.cstep;
|
||||
constants[3].i = in1_gpu_padded.w;
|
||||
constants[4].i = in1_gpu_padded.h;
|
||||
constants[5].i = in1_gpu_padded.cstep;
|
||||
|
||||
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
|
||||
}
|
||||
|
||||
// flownet
|
||||
ncnn::VkMat flow;
|
||||
{
|
||||
ncnn::Extractor ex = flownet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input0", in0_gpu_padded);
|
||||
ex.input("input1", in1_gpu_padded);
|
||||
ex.extract("flow", flow, cmd);
|
||||
}
|
||||
|
||||
// contextnet
|
||||
ncnn::VkMat ctx0[4];
|
||||
ncnn::VkMat ctx1[4];
|
||||
{
|
||||
ncnn::Extractor ex = contextnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input.1", in0_gpu_padded);
|
||||
ex.input("flow.0", flow);
|
||||
ex.extract("f1", ctx0[0], cmd);
|
||||
ex.extract("f2", ctx0[1], cmd);
|
||||
ex.extract("f3", ctx0[2], cmd);
|
||||
ex.extract("f4", ctx0[3], cmd);
|
||||
}
|
||||
{
|
||||
ncnn::Extractor ex = contextnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input.1", in1_gpu_padded);
|
||||
ex.input("flow.1", flow);
|
||||
ex.extract("f1", ctx1[0], cmd);
|
||||
ex.extract("f2", ctx1[1], cmd);
|
||||
ex.extract("f3", ctx1[2], cmd);
|
||||
ex.extract("f4", ctx1[3], cmd);
|
||||
}
|
||||
|
||||
// fusionnet
|
||||
ncnn::VkMat out_gpu_padded;
|
||||
{
|
||||
ncnn::Extractor ex = fusionnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("img0", in0_gpu_padded);
|
||||
ex.input("img1", in1_gpu_padded);
|
||||
ex.input("flow", flow);
|
||||
ex.input("3", ctx0[0]);
|
||||
ex.input("4", ctx0[1]);
|
||||
ex.input("5", ctx0[2]);
|
||||
ex.input("6", ctx0[3]);
|
||||
ex.input("7", ctx1[0]);
|
||||
ex.input("8", ctx1[1]);
|
||||
ex.input("9", ctx1[2]);
|
||||
ex.input("10", ctx1[3]);
|
||||
|
||||
// save some memory
|
||||
in0_gpu.release();
|
||||
in1_gpu.release();
|
||||
flow.release();
|
||||
ctx0[0].release();
|
||||
ctx0[1].release();
|
||||
ctx0[2].release();
|
||||
ctx0[3].release();
|
||||
ctx1[0].release();
|
||||
ctx1[1].release();
|
||||
ctx1[2].release();
|
||||
ctx1[3].release();
|
||||
|
||||
ex.extract("output", out_gpu_padded, cmd);
|
||||
}
|
||||
|
||||
ncnn::VkMat out_gpu;
|
||||
if (opt.use_fp16_storage && opt.use_int8_storage)
|
||||
|
||||
if (tta_mode)
|
||||
{
|
||||
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
|
||||
// preproc
|
||||
ncnn::VkMat in0_gpu_padded[8];
|
||||
ncnn::VkMat in1_gpu_padded[8];
|
||||
{
|
||||
in0_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in0_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
|
||||
std::vector<ncnn::VkMat> bindings(9);
|
||||
bindings[0] = in0_gpu;
|
||||
bindings[1] = in0_gpu_padded[0];
|
||||
bindings[2] = in0_gpu_padded[1];
|
||||
bindings[3] = in0_gpu_padded[2];
|
||||
bindings[4] = in0_gpu_padded[3];
|
||||
bindings[5] = in0_gpu_padded[4];
|
||||
bindings[6] = in0_gpu_padded[5];
|
||||
bindings[7] = in0_gpu_padded[6];
|
||||
bindings[8] = in0_gpu_padded[7];
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = in0_gpu.w;
|
||||
constants[1].i = in0_gpu.h;
|
||||
constants[2].i = in0_gpu.cstep;
|
||||
constants[3].i = in0_gpu_padded[0].w;
|
||||
constants[4].i = in0_gpu_padded[0].h;
|
||||
constants[5].i = in0_gpu_padded[0].cstep;
|
||||
|
||||
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded[0]);
|
||||
}
|
||||
{
|
||||
in1_gpu_padded[0].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[1].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[2].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[3].create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[4].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[5].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[6].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
in1_gpu_padded[7].create(h_padded, w_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
|
||||
std::vector<ncnn::VkMat> bindings(9);
|
||||
bindings[0] = in1_gpu;
|
||||
bindings[1] = in1_gpu_padded[0];
|
||||
bindings[2] = in1_gpu_padded[1];
|
||||
bindings[3] = in1_gpu_padded[2];
|
||||
bindings[4] = in1_gpu_padded[3];
|
||||
bindings[5] = in1_gpu_padded[4];
|
||||
bindings[6] = in1_gpu_padded[5];
|
||||
bindings[7] = in1_gpu_padded[6];
|
||||
bindings[8] = in1_gpu_padded[7];
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = in1_gpu.w;
|
||||
constants[1].i = in1_gpu.h;
|
||||
constants[2].i = in1_gpu.cstep;
|
||||
constants[3].i = in1_gpu_padded[0].w;
|
||||
constants[4].i = in1_gpu_padded[0].h;
|
||||
constants[5].i = in1_gpu_padded[0].cstep;
|
||||
|
||||
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded[0]);
|
||||
}
|
||||
|
||||
ncnn::VkMat flow[8];
|
||||
for (int ti = 0; ti < 8; ti++)
|
||||
{
|
||||
// flownet
|
||||
{
|
||||
ncnn::Extractor ex = flownet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input0", in0_gpu_padded[ti]);
|
||||
ex.input("input1", in1_gpu_padded[ti]);
|
||||
ex.extract("flow", flow[ti], cmd);
|
||||
}
|
||||
}
|
||||
|
||||
// avg flow
|
||||
{
|
||||
std::vector<ncnn::VkMat> bindings(8);
|
||||
bindings[0] = flow[0];
|
||||
bindings[1] = flow[1];
|
||||
bindings[2] = flow[2];
|
||||
bindings[3] = flow[3];
|
||||
bindings[4] = flow[4];
|
||||
bindings[5] = flow[5];
|
||||
bindings[6] = flow[6];
|
||||
bindings[7] = flow[7];
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(3);
|
||||
constants[0].i = flow[0].w;
|
||||
constants[1].i = flow[0].h;
|
||||
constants[2].i = flow[0].cstep;
|
||||
|
||||
ncnn::VkMat dispatcher;
|
||||
dispatcher.w = flow[0].w;
|
||||
dispatcher.h = flow[0].h;
|
||||
dispatcher.c = 1;
|
||||
cmd.record_pipeline(rife_flow_tta_avg, bindings, constants, dispatcher);
|
||||
}
|
||||
|
||||
ncnn::VkMat out_gpu_padded[8];
|
||||
for (int ti = 0; ti < 8; ti++)
|
||||
{
|
||||
// contextnet
|
||||
ncnn::VkMat ctx0[4];
|
||||
ncnn::VkMat ctx1[4];
|
||||
{
|
||||
ncnn::Extractor ex = contextnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input.1", in0_gpu_padded[ti]);
|
||||
ex.input("flow.0", flow[ti]);
|
||||
ex.extract("f1", ctx0[0], cmd);
|
||||
ex.extract("f2", ctx0[1], cmd);
|
||||
ex.extract("f3", ctx0[2], cmd);
|
||||
ex.extract("f4", ctx0[3], cmd);
|
||||
}
|
||||
{
|
||||
ncnn::Extractor ex = contextnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input.1", in1_gpu_padded[ti]);
|
||||
ex.input("flow.1", flow[ti]);
|
||||
ex.extract("f1", ctx1[0], cmd);
|
||||
ex.extract("f2", ctx1[1], cmd);
|
||||
ex.extract("f3", ctx1[2], cmd);
|
||||
ex.extract("f4", ctx1[3], cmd);
|
||||
}
|
||||
|
||||
// fusionnet
|
||||
{
|
||||
ncnn::Extractor ex = fusionnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("img0", in0_gpu_padded[ti]);
|
||||
ex.input("img1", in1_gpu_padded[ti]);
|
||||
ex.input("flow", flow[ti]);
|
||||
ex.input("3", ctx0[0]);
|
||||
ex.input("4", ctx0[1]);
|
||||
ex.input("5", ctx0[2]);
|
||||
ex.input("6", ctx0[3]);
|
||||
ex.input("7", ctx1[0]);
|
||||
ex.input("8", ctx1[1]);
|
||||
ex.input("9", ctx1[2]);
|
||||
ex.input("10", ctx1[3]);
|
||||
|
||||
// save some memory
|
||||
if (ti == 0)
|
||||
{
|
||||
in0_gpu.release();
|
||||
in1_gpu.release();
|
||||
}
|
||||
else
|
||||
{
|
||||
in0_gpu_padded[ti - 1].release();
|
||||
in1_gpu_padded[ti - 1].release();
|
||||
flow[ti - 1].release();
|
||||
}
|
||||
ctx0[0].release();
|
||||
ctx0[1].release();
|
||||
ctx0[2].release();
|
||||
ctx0[3].release();
|
||||
ctx1[0].release();
|
||||
ctx1[1].release();
|
||||
ctx1[2].release();
|
||||
ctx1[3].release();
|
||||
|
||||
ex.extract("output", out_gpu_padded[ti], cmd);
|
||||
}
|
||||
}
|
||||
|
||||
if (opt.use_fp16_storage && opt.use_int8_storage)
|
||||
{
|
||||
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
|
||||
}
|
||||
else
|
||||
{
|
||||
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
|
||||
}
|
||||
|
||||
// postproc
|
||||
{
|
||||
std::vector<ncnn::VkMat> bindings(9);
|
||||
bindings[0] = out_gpu_padded[0];
|
||||
bindings[1] = out_gpu_padded[1];
|
||||
bindings[2] = out_gpu_padded[2];
|
||||
bindings[3] = out_gpu_padded[3];
|
||||
bindings[4] = out_gpu_padded[4];
|
||||
bindings[5] = out_gpu_padded[5];
|
||||
bindings[6] = out_gpu_padded[6];
|
||||
bindings[7] = out_gpu_padded[7];
|
||||
bindings[8] = out_gpu;
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = out_gpu_padded[0].w;
|
||||
constants[1].i = out_gpu_padded[0].h;
|
||||
constants[2].i = out_gpu_padded[0].cstep;
|
||||
constants[3].i = out_gpu.w;
|
||||
constants[4].i = out_gpu.h;
|
||||
constants[5].i = out_gpu.cstep;
|
||||
|
||||
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
|
||||
}
|
||||
// preproc
|
||||
ncnn::VkMat in0_gpu_padded;
|
||||
ncnn::VkMat in1_gpu_padded;
|
||||
{
|
||||
in0_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
|
||||
// postproc
|
||||
{
|
||||
std::vector<ncnn::VkMat> bindings(2);
|
||||
bindings[0] = out_gpu_padded;
|
||||
bindings[1] = out_gpu;
|
||||
std::vector<ncnn::VkMat> bindings(2);
|
||||
bindings[0] = in0_gpu;
|
||||
bindings[1] = in0_gpu_padded;
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = out_gpu_padded.w;
|
||||
constants[1].i = out_gpu_padded.h;
|
||||
constants[2].i = out_gpu_padded.cstep;
|
||||
constants[3].i = out_gpu.w;
|
||||
constants[4].i = out_gpu.h;
|
||||
constants[5].i = out_gpu.cstep;
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = in0_gpu.w;
|
||||
constants[1].i = in0_gpu.h;
|
||||
constants[2].i = in0_gpu.cstep;
|
||||
constants[3].i = in0_gpu_padded.w;
|
||||
constants[4].i = in0_gpu_padded.h;
|
||||
constants[5].i = in0_gpu_padded.cstep;
|
||||
|
||||
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
|
||||
cmd.record_pipeline(rife_preproc, bindings, constants, in0_gpu_padded);
|
||||
}
|
||||
{
|
||||
in1_gpu_padded.create(w_padded, h_padded, 3, in_out_tile_elemsize, 1, blob_vkallocator);
|
||||
|
||||
std::vector<ncnn::VkMat> bindings(2);
|
||||
bindings[0] = in1_gpu;
|
||||
bindings[1] = in1_gpu_padded;
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = in1_gpu.w;
|
||||
constants[1].i = in1_gpu.h;
|
||||
constants[2].i = in1_gpu.cstep;
|
||||
constants[3].i = in1_gpu_padded.w;
|
||||
constants[4].i = in1_gpu_padded.h;
|
||||
constants[5].i = in1_gpu_padded.cstep;
|
||||
|
||||
cmd.record_pipeline(rife_preproc, bindings, constants, in1_gpu_padded);
|
||||
}
|
||||
|
||||
// flownet
|
||||
ncnn::VkMat flow;
|
||||
{
|
||||
ncnn::Extractor ex = flownet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input0", in0_gpu_padded);
|
||||
ex.input("input1", in1_gpu_padded);
|
||||
ex.extract("flow", flow, cmd);
|
||||
}
|
||||
|
||||
// contextnet
|
||||
ncnn::VkMat ctx0[4];
|
||||
ncnn::VkMat ctx1[4];
|
||||
{
|
||||
ncnn::Extractor ex = contextnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input.1", in0_gpu_padded);
|
||||
ex.input("flow.0", flow);
|
||||
ex.extract("f1", ctx0[0], cmd);
|
||||
ex.extract("f2", ctx0[1], cmd);
|
||||
ex.extract("f3", ctx0[2], cmd);
|
||||
ex.extract("f4", ctx0[3], cmd);
|
||||
}
|
||||
{
|
||||
ncnn::Extractor ex = contextnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("input.1", in1_gpu_padded);
|
||||
ex.input("flow.1", flow);
|
||||
ex.extract("f1", ctx1[0], cmd);
|
||||
ex.extract("f2", ctx1[1], cmd);
|
||||
ex.extract("f3", ctx1[2], cmd);
|
||||
ex.extract("f4", ctx1[3], cmd);
|
||||
}
|
||||
|
||||
// fusionnet
|
||||
ncnn::VkMat out_gpu_padded;
|
||||
{
|
||||
ncnn::Extractor ex = fusionnet.create_extractor();
|
||||
ex.set_blob_vkallocator(blob_vkallocator);
|
||||
ex.set_workspace_vkallocator(blob_vkallocator);
|
||||
ex.set_staging_vkallocator(staging_vkallocator);
|
||||
|
||||
ex.input("img0", in0_gpu_padded);
|
||||
ex.input("img1", in1_gpu_padded);
|
||||
ex.input("flow", flow);
|
||||
ex.input("3", ctx0[0]);
|
||||
ex.input("4", ctx0[1]);
|
||||
ex.input("5", ctx0[2]);
|
||||
ex.input("6", ctx0[3]);
|
||||
ex.input("7", ctx1[0]);
|
||||
ex.input("8", ctx1[1]);
|
||||
ex.input("9", ctx1[2]);
|
||||
ex.input("10", ctx1[3]);
|
||||
|
||||
// save some memory
|
||||
in0_gpu.release();
|
||||
in1_gpu.release();
|
||||
flow.release();
|
||||
ctx0[0].release();
|
||||
ctx0[1].release();
|
||||
ctx0[2].release();
|
||||
ctx0[3].release();
|
||||
ctx1[0].release();
|
||||
ctx1[1].release();
|
||||
ctx1[2].release();
|
||||
ctx1[3].release();
|
||||
|
||||
ex.extract("output", out_gpu_padded, cmd);
|
||||
}
|
||||
|
||||
if (opt.use_fp16_storage && opt.use_int8_storage)
|
||||
{
|
||||
out_gpu.create(w, h, (size_t)channels, 1, blob_vkallocator);
|
||||
}
|
||||
else
|
||||
{
|
||||
out_gpu.create(w, h, channels, (size_t)4u, 1, blob_vkallocator);
|
||||
}
|
||||
|
||||
// postproc
|
||||
{
|
||||
std::vector<ncnn::VkMat> bindings(2);
|
||||
bindings[0] = out_gpu_padded;
|
||||
bindings[1] = out_gpu;
|
||||
|
||||
std::vector<ncnn::vk_constant_type> constants(6);
|
||||
constants[0].i = out_gpu_padded.w;
|
||||
constants[1].i = out_gpu_padded.h;
|
||||
constants[2].i = out_gpu_padded.cstep;
|
||||
constants[3].i = out_gpu.w;
|
||||
constants[4].i = out_gpu.h;
|
||||
constants[5].i = out_gpu.cstep;
|
||||
|
||||
cmd.record_pipeline(rife_postproc, bindings, constants, out_gpu);
|
||||
}
|
||||
}
|
||||
|
||||
// download
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
class RIFE
|
||||
{
|
||||
public:
|
||||
RIFE(int gpuid);
|
||||
RIFE(int gpuid, bool tta_mode = false);
|
||||
~RIFE();
|
||||
|
||||
#if _WIN32
|
||||
|
@ -29,6 +29,8 @@ private:
|
|||
ncnn::Net fusionnet;
|
||||
ncnn::Pipeline* rife_preproc;
|
||||
ncnn::Pipeline* rife_postproc;
|
||||
ncnn::Pipeline* rife_flow_tta_avg;
|
||||
bool tta_mode;
|
||||
};
|
||||
|
||||
#endif // RIFE_H
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
// rife implemented with ncnn library
|
||||
|
||||
#version 450
|
||||
|
||||
#if NCNN_fp16_storage
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#endif
|
||||
|
||||
layout (binding = 0) buffer flow_blob0 { sfp flow_blob0_data[]; };
|
||||
layout (binding = 1) buffer flow_blob1 { sfp flow_blob1_data[]; };
|
||||
layout (binding = 2) buffer flow_blob2 { sfp flow_blob2_data[]; };
|
||||
layout (binding = 3) buffer flow_blob3 { sfp flow_blob3_data[]; };
|
||||
layout (binding = 4) buffer flow_blob4 { sfp flow_blob4_data[]; };
|
||||
layout (binding = 5) buffer flow_blob5 { sfp flow_blob5_data[]; };
|
||||
layout (binding = 6) buffer flow_blob6 { sfp flow_blob6_data[]; };
|
||||
layout (binding = 7) buffer flow_blob7 { sfp flow_blob7_data[]; };
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int w;
|
||||
int h;
|
||||
int cstep;
|
||||
} p;
|
||||
|
||||
// Averages the eight flow estimates in place, one invocation per pixel
// (gx, gy) of a p.w x p.h field.
//
// Blobs 0-3 are addressed with p.w-wide rows: as-is, x-mirrored, x+y-mirrored
// and y-mirrored. Blobs 4-7 are addressed with p.h-wide rows and gx/gy swapped
// (the transposed counterparts). Channel 0 starts at offset 0 and channel 1 at
// offset p.cstep in every blob.
//
// NOTE(review): presumably blob k holds the flow predicted for the k-th
// flipped/transposed input produced by the TTA preproc shader - confirm
// against the caller.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.w || gy >= p.h || gz >= 1)
        return;

    // mirrored coordinates
    int fx = p.w - 1 - gx;
    int fy = p.h - 1 - gy;

    // per-blob element index of this pixel (channel 0)
    int i0 = gy * p.w + gx;
    int i1 = gy * p.w + fx;
    int i2 = fy * p.w + fx;
    int i3 = fy * p.w + gx;
    int i4 = gx * p.h + gy;
    int i5 = gx * p.h + fy;
    int i6 = fx * p.h + fy;
    int i7 = fx * p.h + gy;

    // channel 0 of every estimate
    float x0 = float(flow_blob0_data[i0]);
    float x1 = float(flow_blob1_data[i1]);
    float x2 = float(flow_blob2_data[i2]);
    float x3 = float(flow_blob3_data[i3]);
    float x4 = float(flow_blob4_data[i4]);
    float x5 = float(flow_blob5_data[i5]);
    float x6 = float(flow_blob6_data[i6]);
    float x7 = float(flow_blob7_data[i7]);

    // channel 1 of every estimate
    float y0 = float(flow_blob0_data[p.cstep + i0]);
    float y1 = float(flow_blob1_data[p.cstep + i1]);
    float y2 = float(flow_blob2_data[p.cstep + i2]);
    float y3 = float(flow_blob3_data[p.cstep + i3]);
    float y4 = float(flow_blob4_data[p.cstep + i4]);
    float y5 = float(flow_blob5_data[p.cstep + i5]);
    float y6 = float(flow_blob6_data[p.cstep + i6]);
    float y7 = float(flow_blob7_data[p.cstep + i7]);

    // Undo each variant's sign flip / channel swap, then take the mean.
    float x = (x0 - x1 - x2 + x3 + y4 + y5 - y6 - y7) * 0.125f;
    float y = (y0 + y1 - y2 - y3 + x4 - x5 - x6 + x7) * 0.125f;

    // Write the mean back to every blob, re-applying that blob's own
    // sign flip / channel swap so each one stays in its own orientation.
    flow_blob0_data[i0] = sfp(x);
    flow_blob0_data[p.cstep + i0] = sfp(y);

    flow_blob1_data[i1] = sfp(-x);
    flow_blob1_data[p.cstep + i1] = sfp(y);

    flow_blob2_data[i2] = sfp(-x);
    flow_blob2_data[p.cstep + i2] = sfp(-y);

    flow_blob3_data[i3] = sfp(x);
    flow_blob3_data[p.cstep + i3] = sfp(-y);

    flow_blob4_data[i4] = sfp(y);
    flow_blob4_data[p.cstep + i4] = sfp(x);

    flow_blob5_data[i5] = sfp(-y);
    flow_blob5_data[p.cstep + i5] = sfp(x);

    flow_blob6_data[i6] = sfp(-y);
    flow_blob6_data[p.cstep + i6] = sfp(-x);

    flow_blob7_data[i7] = sfp(y);
    flow_blob7_data[p.cstep + i7] = sfp(-x);
}
|
|
@ -0,0 +1,81 @@
|
|||
// rife implemented with ncnn library
|
||||
|
||||
#version 450
|
||||
|
||||
#if NCNN_fp16_storage
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#endif
|
||||
|
||||
#if NCNN_int8_storage
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#endif
|
||||
|
||||
layout (constant_id = 0) const int bgr = 0;
|
||||
|
||||
layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; };
|
||||
layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; };
|
||||
layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; };
|
||||
layout (binding = 3) readonly buffer bottom_blob3 { sfp bottom_blob3_data[]; };
|
||||
layout (binding = 4) readonly buffer bottom_blob4 { sfp bottom_blob4_data[]; };
|
||||
layout (binding = 5) readonly buffer bottom_blob5 { sfp bottom_blob5_data[]; };
|
||||
layout (binding = 6) readonly buffer bottom_blob6 { sfp bottom_blob6_data[]; };
|
||||
layout (binding = 7) readonly buffer bottom_blob7 { sfp bottom_blob7_data[]; };
|
||||
#if NCNN_int8_storage
|
||||
layout (binding = 8) writeonly buffer top_blob { uint8_t top_blob_data[]; };
|
||||
#else
|
||||
layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; };
|
||||
#endif
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int w;
|
||||
int h;
|
||||
int cstep;
|
||||
|
||||
int outw;
|
||||
int outh;
|
||||
int outcstep;
|
||||
} p;
|
||||
|
||||
// Merges the eight TTA results into one output pixel per invocation.
//
// Each invocation handles output element (gx, gy, gz) with gz in [0, 3).
// It reads the matching element from the eight input blobs (blobs 0-3 use
// p.w-wide rows with x/y mirroring, blobs 4-7 use p.h-wide rows with gx/gy
// swapped), averages them, scales by 255 and adds 0.5 for rounding.
//
// With NCNN_int8_storage the value is written as an interleaved 8-bit pixel
// (channel order reversed when bgr != 0); otherwise it is written as a float
// into a planar blob using p.outcstep.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= 3)
        return;

    // mirrored coordinates inside the p.w x p.h input blobs
    int fx = p.w - 1 - gx;
    int fy = p.h - 1 - gy;

    int gzi = gz * p.cstep;

    float v0 = float(bottom_blob0_data[gzi + gy * p.w + gx]);
    float v1 = float(bottom_blob1_data[gzi + gy * p.w + fx]);
    float v2 = float(bottom_blob2_data[gzi + fy * p.w + fx]);
    float v3 = float(bottom_blob3_data[gzi + fy * p.w + gx]);
    float v4 = float(bottom_blob4_data[gzi + gx * p.h + gy]);
    float v5 = float(bottom_blob5_data[gzi + gx * p.h + fy]);
    float v6 = float(bottom_blob6_data[gzi + fx * p.h + fy]);
    float v7 = float(bottom_blob7_data[gzi + fx * p.h + gy]);

    float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f;

    const float denorm_val = 255.f;
    const float clip_eps = 0.5f;

    v = v * denorm_val + clip_eps;

#if NCNN_int8_storage
    int v_offset = gy * p.outw + gx;

    // Clamp while still in float: converting a negative float to uint is
    // undefined in GLSL, so the range must be enforced before the cast.
    uint v32 = uint(clamp(floor(v), 0.f, 255.f));

    if (bgr == 0)
        top_blob_data[v_offset * 3 + gz] = uint8_t(v32);
    else
        top_blob_data[v_offset * 3 + 2 - gz] = uint8_t(v32);
#else
    int v_offset = gz * p.outcstep + gy * p.outw + gx;

    top_blob_data[v_offset] = v;
#endif
}
|
|
@ -0,0 +1,93 @@
|
|||
// rife implemented with ncnn library
|
||||
|
||||
#version 450
|
||||
|
||||
#if NCNN_fp16_storage
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#endif
|
||||
|
||||
#if NCNN_int8_storage
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#endif
|
||||
|
||||
layout (constant_id = 0) const int bgr = 0;
|
||||
|
||||
#if NCNN_int8_storage
|
||||
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
|
||||
#else
|
||||
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
|
||||
#endif
|
||||
layout (binding = 1) writeonly buffer top_blob0 { sfp top_blob0_data[]; };
|
||||
layout (binding = 2) writeonly buffer top_blob1 { sfp top_blob1_data[]; };
|
||||
layout (binding = 3) writeonly buffer top_blob2 { sfp top_blob2_data[]; };
|
||||
layout (binding = 4) writeonly buffer top_blob3 { sfp top_blob3_data[]; };
|
||||
layout (binding = 5) writeonly buffer top_blob4 { sfp top_blob4_data[]; };
|
||||
layout (binding = 6) writeonly buffer top_blob5 { sfp top_blob5_data[]; };
|
||||
layout (binding = 7) writeonly buffer top_blob6 { sfp top_blob6_data[]; };
|
||||
layout (binding = 8) writeonly buffer top_blob7 { sfp top_blob7_data[]; };
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
int w;
|
||||
int h;
|
||||
int cstep;
|
||||
|
||||
int outw;
|
||||
int outh;
|
||||
int outcstep;
|
||||
} p;
|
||||
|
||||
// Expands one input pixel into the eight TTA input blobs.
//
// Each invocation handles element (gx, gy, gz) of the p.outw x p.outh output
// grid, gz in [0, 3). Positions outside the p.w x p.h source image are filled
// with 0; positions inside receive the source value scaled by 1/255.
// The value is stored eight times: blobs 0-3 use p.outw-wide rows (as-is,
// x-mirrored, x+y-mirrored, y-mirrored) and blobs 4-7 use p.outh-wide rows
// with gx/gy swapped.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= 3)
        return;

    // padding area keeps this zero
    float v = 0.f;

    bool inside = gx >= 0 && gx < p.w && gy >= 0 && gy < p.h;
    if (inside)
    {
#if NCNN_int8_storage
        // interleaved 8-bit source, channel order reversed when bgr != 0
        int v_offset = gy * p.w + gx;
        int src_c = (bgr == 0) ? gz : 2 - gz;

        v = float(uint(bottom_blob_data[v_offset * 3 + src_c]));
#else
        // planar float source
        int v_offset = gz * p.cstep + gy * p.w + gx;

        v = bottom_blob_data[v_offset];
#endif

        const float norm_val = 1 / 255.f;

        v = v * norm_val;
    }

    // mirrored coordinates inside the output grid
    int fx = p.outw - 1 - gx;
    int fy = p.outh - 1 - gy;

    int gzi = gz * p.outcstep;

    top_blob0_data[gzi + gy * p.outw + gx] = sfp(v);
    top_blob1_data[gzi + gy * p.outw + fx] = sfp(v);
    top_blob2_data[gzi + fy * p.outw + fx] = sfp(v);
    top_blob3_data[gzi + fy * p.outw + gx] = sfp(v);
    top_blob4_data[gzi + gx * p.outh + gy] = sfp(v);
    top_blob5_data[gzi + gx * p.outh + fy] = sfp(v);
    top_blob6_data[gzi + fx * p.outh + fy] = sfp(v);
    top_blob7_data[gzi + fx * p.outh + gy] = sfp(v);
}
|
Loading…
Reference in New Issue