author     Lynne <dev@lynne.ee>    2023-04-13 12:15:13 +0200
committer  Lynne <dev@lynne.ee>    2023-05-29 00:42:01 +0200
commit     160a415e22af417d81072dc3cb616649aea2388b (patch)
tree       b562b95ac196693d1785cb7123e8e6523f123f77
parent     dfff3877b70b3d4493312efdea6d00ded0458f49 (diff)
download   ffmpeg-160a415e22af417d81072dc3cb616649aea2388b.tar.gz
lavfi: add nlmeans_vulkan filter
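[Editor's note: the commit message is terse, so a usage sketch may help. Assuming a build configured with Vulkan and a SPIR-V compiler (the new "vulkan spirv_compiler" configure dependency below), an invocation might look like the following; the option values are illustrative only, and `s` (strength), `p` (patch size) and `r` (research window) correspond to the AVOption table in vf_nlmeans_vulkan.c in this patch:]

    ffmpeg -init_hw_device vulkan -i input.mkv \
           -vf "hwupload,nlmeans_vulkan=s=3:p=7:r=15,hwdownload,format=yuv420p" \
           output.mkv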
-rwxr-xr-x  configure                             1
-rw-r--r--  libavfilter/Makefile                  2
-rw-r--r--  libavfilter/allfilters.c              1
-rw-r--r--  libavfilter/vf_nlmeans_vulkan.c    1122
-rw-r--r--  libavfilter/vulkan/prefix_sum.comp  151
-rw-r--r--  libavutil/vulkan_functions.h          1

6 files changed, 1278 insertions, 0 deletions
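[Editor's note: for orientation while reading the weights shader below, the arithmetic it generates can be summarized as follows. This is a reconstruction from insert_weights_pass() and init_filter() in the patch, not text from the commit. The squared patch difference is read in O(1) from an integral image of squared pixel differences via its four corners, converted to a weight, and accumulated:

$$\Delta^2 = d + a - b - c, \qquad w = \exp\!\left(-\Big(\tfrac{255}{10\,s}\Big)^{2}\,\Delta^2\right), \qquad \mathrm{out}(p) = \frac{\sum_q w_q\, I(q) + I(p)}{1 + \sum_q w_q}$$

where $a$, $b$, $c$, $d$ are the integral-image values at the patch corners, $s$ is the strength option, and the sums run over all offsets $q$ in the research window.]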
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -3705,6 +3705,7 @@ minterpolate_filter_select="scene_sad"
 mptestsrc_filter_deps="gpl"
 negate_filter_deps="lut_filter"
 nlmeans_opencl_filter_deps="opencl"
+nlmeans_vulkan_filter_deps="vulkan spirv_compiler"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 01c083e77d..18935b1616 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -390,6 +390,8 @@ OBJS-$(CONFIG_MULTIPLY_FILTER)               += vf_multiply.o
 OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_negate.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += vf_nlmeans.o
 OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER)         += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
+OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER)         += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o \
+                                                vulkan/prefix_sum.o
 OBJS-$(CONFIG_NNEDI_FILTER)                  += vf_nnedi.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 30a8830f68..f1f781101b 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -368,6 +368,7 @@ extern const AVFilter ff_vf_multiply;
 extern const AVFilter ff_vf_negate;
 extern const AVFilter ff_vf_nlmeans;
 extern const AVFilter ff_vf_nlmeans_opencl;
+extern const AVFilter ff_vf_nlmeans_vulkan;
 extern const AVFilter ff_vf_nnedi;
 extern const AVFilter ff_vf_noformat;
 extern const AVFilter ff_vf_noise;
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
new file mode 100644
index 0000000000..97d894c9bb
--- /dev/null
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -0,0 +1,1122 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/random_seed.h"
+#include "libavutil/opt.h"
+#include "vulkan_filter.h"
+#include "vulkan_spirv.h"
+#include "internal.h"
+
+#define TYPE_NAME  "vec4"
+#define TYPE_ELEMS 4
+#define TYPE_SIZE  (TYPE_ELEMS*4)
+
+typedef struct NLMeansVulkanContext {
+    FFVulkanContext vkctx;
+
+    int initialized;
+    FFVkExecPool e;
+    FFVkQueueFamilyCtx qf;
+    VkSampler sampler;
+
+    AVBufferPool *integral_buf_pool;
+    AVBufferPool *state_buf_pool;
+    AVBufferPool *ws_buf_pool;
+
+    int pl_weights_rows;
+    FFVulkanPipeline pl_weights;
+    FFVkSPIRVShader shd_weights;
+
+    FFVulkanPipeline pl_denoise;
+    FFVkSPIRVShader shd_denoise;
+
+    int *xoffsets;
+    int *yoffsets;
+    int nb_offsets;
+    float strength[4];
+    int patch[4];
+
+    struct nlmeans_opts {
+        int r;
+        double s;
+        double sc[4];
+        int p;
+        int pc[4];
+        int t;
+    } opts;
+} NLMeansVulkanContext;
+
+extern const char *ff_source_prefix_sum_comp;
+
+static void insert_first(FFVkSPIRVShader *shd, int r, int horiz, int plane, int comp)
+{
+    GLSLF(2, s1 = texture(input_img[%i], ivec2(x + %i, y + %i))[%i];
+          ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
+
+    if (TYPE_ELEMS == 4) {
+        GLSLF(2, s2[0] = texture(input_img[%i], ivec2(x + %i + xoffs[0], y + %i + yoffs[0]))[%i];
+              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
+        GLSLF(2, s2[1] = texture(input_img[%i], ivec2(x + %i + xoffs[1], y + %i + yoffs[1]))[%i];
+              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
+        GLSLF(2, s2[2] = texture(input_img[%i], ivec2(x + %i + xoffs[2], y + %i + yoffs[2]))[%i];
+              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
+        GLSLF(2, s2[3] = texture(input_img[%i], ivec2(x + %i + xoffs[3], y + %i + yoffs[3]))[%i];
+              ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
+    } else {
+        for (int i = 0; i < 16; i++) {
+            GLSLF(2, s2[%i][%i] = texture(input_img[%i], ivec2(x + %i + xoffs[%i], y + %i + yoffs[%i]))[%i];
+                  ,i / 4, i % 4, plane, horiz ? r : 0, i, !horiz ? r : 0, i, comp);
+        }
+    }
+
+    GLSLC(2, s2 = (s1 - s2) * (s1 - s2); );
+}
+
+static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
+{
+    GLSLF(1, x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
+    if (!first) {
+        GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
+                                gl_StorageSemanticsBuffer,
+                                gl_SemanticsAcquireRelease |
+                                gl_SemanticsMakeAvailable |
+                                gl_SemanticsMakeVisible); );
+    }
+    GLSLC(1, for (y = 0; y < height[0]; y++) { );
+    GLSLC(2,     offset = uint64_t(int_stride)*y*T_ALIGN; );
+    GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + offset); );
+    GLSLC(0, );
+    if (first) {
+        for (int r = 0; r < nb_rows; r++) {
+            insert_first(shd, r, 1, plane, comp);
+            GLSLF(2, dst.v[x + %i] = s2; ,r);
+            GLSLC(0, );
+        }
+    }
+    GLSLC(2, barrier(); );
+    GLSLC(2, prefix_sum(dst, 1, dst, 1); );
+    GLSLC(1, } );
+    GLSLC(0, );
+}
+
+static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
+{
+    GLSLF(1, y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
+    if (!first) {
+        GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
+                                gl_StorageSemanticsBuffer,
+                                gl_SemanticsAcquireRelease |
+                                gl_SemanticsMakeAvailable |
+                                gl_SemanticsMakeVisible); );
+    }
+    GLSLC(1, for (x = 0; x < width[0]; x++) { );
+    GLSLC(2,     dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN); );
+
+    for (int r = 0; r < nb_rows; r++) {
+        if (first) {
+            insert_first(shd, r, 0, plane, comp);
+            GLSLF(2, integral_data.v[(y + %i)*int_stride + x] = s2; ,r);
+            GLSLC(0, );
+        }
+    }
+
+    GLSLC(2, barrier(); );
+    GLSLC(2, prefix_sum(dst, int_stride, dst, int_stride); );
+    GLSLC(1, } );
+    GLSLC(0, );
+}
+
+static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
+                                int t, int dst_comp, int plane, int comp)
+{
+    GLSLF(1, p = patch_size[%i]; ,dst_comp);
+    GLSLC(0, );
+    GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
+                            gl_StorageSemanticsBuffer,
+                            gl_SemanticsAcquireRelease |
+                            gl_SemanticsMakeAvailable |
+                            gl_SemanticsMakeVisible); );
+    GLSLC(1, barrier(); );
+    if (!vert) {
+        GLSLC(1, for (y = 0; y < height[0]; y++) { );
+        GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
+        GLSLC(3,         break; );
+        GLSLF(2,     for (r = 0; r < %i; r++) { ,nb_rows);
+        GLSLF(3,         x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
+    } else {
+        GLSLC(1, for (x = 0; x < width[0]; x++) { );
+        GLSLF(2,     if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
+        GLSLC(3,         break; );
+        GLSLF(2,     for (r = 0; r < %i; r++) { ,nb_rows);
+        GLSLF(3,         y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
+    }
+    GLSLC(0, );
+    GLSLC(3, a = DTYPE(0); );
+    GLSLC(3, b = DTYPE(0); );
+    GLSLC(3, c = DTYPE(0); );
+    GLSLC(3, d = DTYPE(0); );
+    GLSLC(0, );
+    GLSLC(3, lt = ((x - p) < 0) || ((y - p) < 0); );
+    GLSLC(0, );
+    if (TYPE_ELEMS == 4) {
+        GLSLF(3, src[0] = texture(input_img[%i], ivec2(x + xoffs[0], y + yoffs[0]))[%i]; ,plane, comp);
+        GLSLF(3, src[1] = texture(input_img[%i], ivec2(x + xoffs[1], y + yoffs[1]))[%i]; ,plane, comp);
+        GLSLF(3, src[2] = texture(input_img[%i], ivec2(x + xoffs[2], y + yoffs[2]))[%i]; ,plane, comp);
+        GLSLF(3, src[3] = texture(input_img[%i], ivec2(x + xoffs[3], y + yoffs[3]))[%i]; ,plane, comp);
+    } else {
+        for (int i = 0; i < 16; i++)
+            GLSLF(3, src[%i][%i] = texture(input_img[%i], ivec2(x + xoffs[%i], y + yoffs[%i]))[%i];
+                  ,i / 4, i % 4, plane, i, i, comp);
+    }
+    GLSLC(0, );
+    GLSLC(3, if (lt == false) { );
+    GLSLC(4,     a = integral_data.v[(y - p)*int_stride + x - p]; );
+    GLSLC(4,     c = integral_data.v[(y - p)*int_stride + x + p]; );
+    GLSLC(4,     b = integral_data.v[(y + p)*int_stride + x - p]; );
+    GLSLC(4,     d = integral_data.v[(y + p)*int_stride + x + p]; );
+    GLSLC(3, } );
+    GLSLC(0, );
+    GLSLC(3, patch_diff = d + a - b - c; );
+    if (TYPE_ELEMS == 4) {
+        GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
+        GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
+        GLSLC(3, sum = dot(w, src*255); );
+    } else {
+        for (int i = 0; i < 4; i++)
+            GLSLF(3, w[%i] = exp(patch_diff[%i] * strength[%i]); ,i,i,dst_comp);
+        for (int i = 0; i < 4; i++)
+            GLSLF(3, w_sum %s w[%i][0] + w[%i][1] + w[%i][2] + w[%i][3];
+                  ,!i ? "=" : "+=", i, i, i, i);
+        for (int i = 0; i < 4; i++)
+            GLSLF(3, sum %s dot(w[%i], src[%i]*255);
+                  ,!i ? "=" : "+=", i, i);
+    }
+    GLSLC(0, );
+    if (t > 1) {
+        GLSLF(3, atomicAdd(weights_%i[y*ws_stride[%i] + x], w_sum); ,dst_comp, dst_comp);
+        GLSLF(3, atomicAdd(sums_%i[y*ws_stride[%i] + x], sum); ,dst_comp, dst_comp);
+    } else {
+        GLSLF(3, weights_%i[y*ws_stride[%i] + x] += w_sum; ,dst_comp, dst_comp);
+        GLSLF(3, sums_%i[y*ws_stride[%i] + x] += sum; ,dst_comp, dst_comp);
+    }
+    GLSLC(2, } );
+    GLSLC(1, } );
+}
+
+typedef struct HorizontalPushData {
+    VkDeviceAddress integral_data;
+    VkDeviceAddress state_data;
+    int32_t  xoffs[TYPE_ELEMS];
+    int32_t  yoffs[TYPE_ELEMS];
+    uint32_t width[4];
+    uint32_t height[4];
+    uint32_t ws_stride[4];
+    int32_t  patch_size[4];
+    float    strength[4];
+    uint32_t int_stride;
+} HorizontalPushData;
+
+static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
+                                         FFVulkanPipeline *pl, FFVkSPIRVShader *shd,
+                                         VkSampler sampler, FFVkSPIRVCompiler *spv,
+                                         int width, int height, int t,
+                                         const AVPixFmtDescriptor *desc,
+                                         int planes, int *nb_rows)
+{
+    int err;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanDescriptorSetBinding *desc_set;
+    int max_dim = FFMAX(width, height);
+    uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
+    int max_shm = vkctx->props.properties.limits.maxComputeSharedMemorySize;
+    int wg_size, wg_rows;
+
+    /* Round the max workgroup size to the previous power of two */
+    max_wg = 1 << (31 - ff_clz(max_wg));
+    wg_size = max_wg;
+    wg_rows = 1;
+
+    if (max_wg > max_dim) {
+        wg_size = max_wg / (max_wg / max_dim);
+    } else if (max_wg < max_dim) {
+        /* First, make it fit */
+        while (wg_size*wg_rows < max_dim)
+            wg_rows++;
+
+        /* Second, make sure there's enough shared memory */
+        while ((wg_size * TYPE_SIZE + TYPE_SIZE + 2*4) > max_shm) {
+            wg_size >>= 1;
+            wg_rows++;
+        }
+    }
+
+    RET(ff_vk_shader_init(pl, shd, "nlmeans_weights", VK_SHADER_STAGE_COMPUTE_BIT, 0));
+    ff_vk_shader_set_compute_sizes(shd, wg_size, 1, 1);
+    *nb_rows = wg_rows;
+
+    if (t > 1)
+        GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
+    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
+    GLSLC(0, #pragma use_vulkan_memory_model );
+    GLSLC(0, #extension GL_KHR_memory_scope_semantics : enable );
+    GLSLC(0, );
+    GLSLF(0, #define N_ROWS %i ,*nb_rows);
+    GLSLC(0, #define WG_SIZE (gl_WorkGroupSize.x) );
+    GLSLF(0, #define LG_WG_SIZE %i ,ff_log2(shd->local_size[0]));
+    GLSLC(0, #define PARTITION_SIZE (N_ROWS*WG_SIZE) );
+    GLSLF(0, #define DTYPE %s ,TYPE_NAME);
+    GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
+    GLSLC(0, );
+    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) coherent buffer DataBuffer { );
+    GLSLC(1,     DTYPE v[]; );
+    GLSLC(0, }; );
+    GLSLC(0, );
+    GLSLC(0, layout(buffer_reference) buffer StateData; );
+    GLSLC(0, );
+    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
+    GLSLC(1,     coherent DataBuffer integral_data; );
+    GLSLC(1,     StateData state; );
+    GLSLF(1,     uint xoffs[%i]; ,TYPE_ELEMS);
+    GLSLF(1,     uint yoffs[%i]; ,TYPE_ELEMS);
+    GLSLC(1,     uvec4 width; );
+    GLSLC(1,     uvec4 height; );
+    GLSLC(1,     uvec4 ws_stride; );
+    GLSLC(1,     ivec4 patch_size; );
+    GLSLC(1,     vec4 strength; );
+    GLSLC(1,     uint int_stride; );
+    GLSLC(0, }; );
+    GLSLC(0, );
+
+    ff_vk_add_push_constant(pl, 0, sizeof(HorizontalPushData), VK_SHADER_STAGE_COMPUTE_BIT);
+
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "input_img",
+            .type        = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .dimensions  = 2,
+            .elems       = planes,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .samplers    = DUP_SAMPLER(sampler),
+        },
+        {
+            .name        = "weights_buffer_0",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_0[];",
+        },
+        {
+            .name        = "sums_buffer_0",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_0[];",
+        },
+        {
+            .name        = "weights_buffer_1",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_1[];",
+        },
+        {
+            .name        = "sums_buffer_1",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_1[];",
+        },
+        {
+            .name        = "weights_buffer_2",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_2[];",
+        },
+        {
+            .name        = "sums_buffer_2",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_2[];",
+        },
+        {
+            .name        = "weights_buffer_3",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_3[];",
+        },
+        {
+            .name        = "sums_buffer_3",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_3[];",
+        },
+    };
+    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1 + 2*desc->nb_components, 0, 0));
+
+    GLSLD(   ff_source_prefix_sum_comp                                        );
+    GLSLC(0, );
+    GLSLC(0, void main() );
+    GLSLC(0, { );
+    GLSLC(1,     uint64_t offset; );
+    GLSLC(1,     DataBuffer dst; );
+    GLSLC(1,     float s1; );
+    GLSLC(1,     DTYPE s2; );
+    GLSLC(1,     int r; );
+    GLSLC(1,     int x; );
+    GLSLC(1,     int y; );
+    GLSLC(1,     int p; );
+    GLSLC(0, );
+    GLSLC(1,     DTYPE a; );
+    GLSLC(1,     DTYPE b; );
+    GLSLC(1,     DTYPE c; );
+    GLSLC(1,     DTYPE d; );
+    GLSLC(0, );
+    GLSLC(1,     DTYPE patch_diff; );
+    if (TYPE_ELEMS == 4) {
+        GLSLC(1, vec4 src; );
+        GLSLC(1, vec4 w; );
+    } else {
+        GLSLC(1, vec4 src[4]; );
+        GLSLC(1, vec4 w[4]; );
+    }
+    GLSLC(1,     float w_sum; );
+    GLSLC(1,     float sum; );
+    GLSLC(0, );
+    GLSLC(1,     bool lt; );
+    GLSLC(1,     bool gt; );
+    GLSLC(0, );
+
+    for (int i = 0; i < desc->nb_components; i++) {
+        int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
+        if (width > height) {
+            insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
+            insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
+            insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off);
+        } else {
+            insert_vertical_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
+            insert_horizontal_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
+            insert_weights_pass(shd, *nb_rows, 1, t, i, desc->comp[i].plane, off);
+        }
+    }
+
+    GLSLC(0, } );
+
+    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));
+
+    RET(ff_vk_init_compute_pipeline(vkctx, pl, shd));
+    RET(ff_vk_exec_pipeline_register(vkctx, exec, pl));
+
+    return 0;
+
+fail:
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+typedef struct DenoisePushData {
+    uint32_t ws_stride[4];
+} DenoisePushData;
+
+static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
+                                         FFVulkanPipeline *pl, FFVkSPIRVShader *shd,
+                                         VkSampler sampler, FFVkSPIRVCompiler *spv,
+                                         const AVPixFmtDescriptor *desc, int planes)
+{
+    int err;
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+    FFVulkanDescriptorSetBinding *desc_set;
+
+    RET(ff_vk_shader_init(pl, shd, "nlmeans_denoise",
+                          VK_SHADER_STAGE_COMPUTE_BIT, 0));
+
+    ff_vk_shader_set_compute_sizes(shd, 32, 32, 1);
+
+    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
+    GLSLC(1,     uvec4 ws_stride; );
+    GLSLC(0, }; );
+
+    ff_vk_add_push_constant(pl, 0, sizeof(DenoisePushData), VK_SHADER_STAGE_COMPUTE_BIT);
+
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "input_img",
+            .type        = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .dimensions  = 2,
+            .elems       = planes,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .samplers    = DUP_SAMPLER(sampler),
+        },
+        {
+            .name        = "output_img",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .mem_layout  = ff_vk_shader_rep_fmt(vkctx->output_format),
+            .mem_quali   = "writeonly",
+            .dimensions  = 2,
+            .elems       = planes,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        {
+            .name        = "weights_buffer_0",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_0[];",
+        },
+        {
+            .name        = "sums_buffer_0",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_0[];",
+        },
+        {
+            .name        = "weights_buffer_1",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_1[];",
+        },
+        {
+            .name        = "sums_buffer_1",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_1[];",
+        },
+        {
+            .name        = "weights_buffer_2",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_2[];",
+        },
+        {
+            .name        = "sums_buffer_2",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_2[];",
+        },
+        {
+            .name        = "weights_buffer_3",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float weights_3[];",
+        },
+        {
+            .name        = "sums_buffer_3",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "float sums_3[];",
+        },
+    };
+    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 2 + 2*desc->nb_components, 0, 0));
+
+    GLSLC(0, void main() );
+    GLSLC(0, { );
+    GLSLC(1,     ivec2 size; );
+    GLSLC(1,     const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
+    GLSLC(0, );
+    GLSLC(1,     float w_sum; );
+    GLSLC(1,     float sum; );
+    GLSLC(1,     vec4 src; );
+    GLSLC(1,     vec4 r; );
+    GLSLC(0, );
+
+    for (int i = 0; i < planes; i++) {
+        GLSLF(1, src = texture(input_img[%i], pos); ,i);
+        for (int c = 0; c < desc->nb_components; c++) {
+            if (desc->comp[c].plane == i) {
+                int off = desc->comp[c].offset / (FFALIGN(desc->comp[c].depth, 8)/8);
+                GLSLF(1, w_sum = weights_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c);
+                GLSLF(1, sum = sums_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c);
+                GLSLF(1, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
+                GLSLC(0, );
+            }
+        }
+        GLSLF(1, imageStore(output_img[%i], pos, r); ,i);
+        GLSLC(0, );
+    }
+
+    GLSLC(0, } );
+
+    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
+    RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));
+
+    RET(ff_vk_init_compute_pipeline(vkctx, pl, shd));
+    RET(ff_vk_exec_pipeline_register(vkctx, exec, pl));
+
+    return 0;
+
+fail:
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
+static av_cold int init_filter(AVFilterContext *ctx)
+{
+    int rad, err;
+    int xcnt = 0, ycnt = 0;
+    NLMeansVulkanContext *s = ctx->priv;
+    FFVulkanContext *vkctx = &s->vkctx;
+    const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
+    FFVkSPIRVCompiler *spv;
+
+    const AVPixFmtDescriptor *desc;
+    desc = av_pix_fmt_desc_get(vkctx->output_format);
+    if (!desc)
+        return AVERROR(EINVAL);
+
+    if (!(s->opts.r & 1)) {
+        s->opts.r |= 1;
+        av_log(ctx, AV_LOG_WARNING, "Research size should be odd, setting to %i",
+               s->opts.r);
+    }
+
+    if (!(s->opts.p & 1)) {
+        s->opts.p |= 1;
+        av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to %i",
+               s->opts.p);
+    }
+
+    for (int i = 0; i < 4; i++) {
+        double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
+        int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
+        str = 10.0f*str;
+        str *= -str;
+        str = 255.0*255.0 / str;
+        s->strength[i] = str;
+        if (!(ps & 1)) {
+            ps |= 1;
+            av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to %i",
+                   ps);
+        }
+        s->patch[i] = ps / 2;
+    }
+
+    rad = s->opts.r/2;
+    s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
+    s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
+    s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));
+    s->nb_offsets = 0;
+
+    for (int x = -rad; x <= rad; x++) {
+        for (int y = -rad; y <= rad; y++) {
+            if (!x && !y)
+                continue;
+
+            s->xoffsets[xcnt++] = x;
+            s->yoffsets[ycnt++] = y;
+            s->nb_offsets++;
+        }
+    }
+
+    s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
+    if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
+        av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, "
+               "disabling dispatch parallelism\n");
+        s->opts.t = 1;
+    }
+
+    if (!vkctx->feats_12.vulkanMemoryModel) {
+        av_log(ctx, AV_LOG_ERROR, "Device doesn't support the Vulkan memory model!");
+        return AVERROR(EINVAL);;
+    }
+
+    spv = ff_vk_spirv_init();
+    if (!spv) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    ff_vk_qf_init(vkctx, &s->qf, VK_QUEUE_COMPUTE_BIT);
+    RET(ff_vk_exec_pool_init(vkctx, &s->qf, &s->e, 1, 0, 0, 0, NULL));
+    RET(ff_vk_init_sampler(vkctx, &s->sampler, 1, VK_FILTER_NEAREST));
+
+    RET(init_weights_pipeline(vkctx, &s->e, &s->pl_weights, &s->shd_weights, s->sampler,
+                              spv, s->vkctx.output_width, s->vkctx.output_height,
+                              s->opts.t, desc, planes, &s->pl_weights_rows));
+
+    RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,
+                              spv, desc, planes));
+
+    av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches, %i parallel\n",
+           s->nb_offsets, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS) + 1, s->opts.t);
+
+    s->initialized = 1;
+
+    return 0;
+
+fail:
+    if (spv)
+        spv->uninit(&spv);
+
+    return err;
+}
+
+static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec,
+                        FFVkBuffer *ws_vk, uint32_t ws_stride[4])
+{
+    FFVulkanContext *vkctx = &s->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+    VkBufferMemoryBarrier2 buf_bar[8];
+    int nb_buf_bar = 0;
+
+    /* Denoise pass pipeline */
+    ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_denoise);
+
+    /* Push data */
+    ff_vk_update_push_exec(vkctx, exec, &s->pl_denoise, VK_SHADER_STAGE_COMPUTE_BIT,
+                           0, sizeof(DenoisePushData), &(DenoisePushData) {
+                               { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
+                           });
+
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask = ws_vk->stage,
+        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .srcAccessMask = ws_vk->access,
+        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = ws_vk->buf,
+        .size = ws_vk->size,
+        .offset = 0,
+    };
+
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = buf_bar,
+        .bufferMemoryBarrierCount = nb_buf_bar,
+    });
+    ws_vk->stage = buf_bar[0].dstStageMask;
+    ws_vk->access = buf_bar[0].dstAccessMask;
+
+    /* End of denoise pass */
+    vk->CmdDispatch(exec->buf,
+                    FFALIGN(vkctx->output_width,  s->pl_denoise.wg_size[0])/s->pl_denoise.wg_size[0],
+                    FFALIGN(vkctx->output_height, s->pl_denoise.wg_size[1])/s->pl_denoise.wg_size[1],
+                    1);
+
+    return 0;
+}
+
+static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    int err;
+    AVFrame *out = NULL;
+    AVFilterContext *ctx = link->dst;
+    NLMeansVulkanContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    FFVulkanContext *vkctx = &s->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+
+    const AVPixFmtDescriptor *desc;
+    int plane_widths[4];
+    int plane_heights[4];
+
+    /* Integral */
+    AVBufferRef *state_buf;
+    FFVkBuffer *state_vk;
+    AVBufferRef *integral_buf;
+    FFVkBuffer *integral_vk;
+    uint32_t int_stride;
+    size_t int_size;
+    size_t state_size;
+    int t_offset = 0;
+
+    /* Weights/sums */
+    AVBufferRef *ws_buf;
+    FFVkBuffer *ws_vk;
+    VkDeviceAddress weights_addr[4];
+    VkDeviceAddress sums_addr[4];
+    uint32_t ws_stride[4];
+    size_t ws_size[4];
+    size_t ws_total_size = 0;
+
+    FFVkExecContext *exec;
+    VkImageView in_views[AV_NUM_DATA_POINTERS];
+    VkImageView out_views[AV_NUM_DATA_POINTERS];
+    VkImageMemoryBarrier2 img_bar[8];
+    int nb_img_bar = 0;
+    VkBufferMemoryBarrier2 buf_bar[8];
+    int nb_buf_bar = 0;
+
+    if (!s->initialized)
+        RET(init_filter(ctx));
+
+    desc = av_pix_fmt_desc_get(vkctx->output_format);
+    if (!desc)
+        return AVERROR(EINVAL);
+
+    /* Integral image */
+    int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows;
+    int_size = int_stride * int_stride * TYPE_SIZE;
+    state_size = int_stride * 3 *TYPE_SIZE;
+
+    /* Plane dimensions */
+    for (int i = 0; i < desc->nb_components; i++) {
+        plane_widths[i]  = !i || (i == 3) ? vkctx->output_width  : AV_CEIL_RSHIFT(vkctx->output_width,  desc->log2_chroma_w);
+        plane_heights[i] = !i || (i == 3) ? vkctx->output_height : AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_w);
+        plane_widths[i]  = FFALIGN(plane_widths[i],  s->pl_denoise.wg_size[0]);
+        plane_heights[i] = FFALIGN(plane_heights[i], s->pl_denoise.wg_size[1]);
+
+        ws_stride[i] = plane_widths[i];
+        ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
+        ws_total_size += ws_size[i];
+    }
+
+    /* Buffers */
+    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->integral_buf_pool, &integral_buf,
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                  NULL,
+                                  s->opts.t * int_size,
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    if (err < 0)
+        return err;
+    integral_vk = (FFVkBuffer *)integral_buf->data;
+
+    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->state_buf_pool, &state_buf,
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                  NULL,
+                                  s->opts.t * state_size,
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    if (err < 0)
+        return err;
+    state_vk = (FFVkBuffer *)state_buf->data;
+
+    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf,
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                  NULL,
+                                  ws_total_size * 2,
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    if (err < 0)
+        return err;
+    ws_vk = (FFVkBuffer *)ws_buf->data;
+
+    weights_addr[0] = ws_vk->address;
+    sums_addr[0] = ws_vk->address + ws_total_size;
+    for (int i = 1; i < desc->nb_components; i++) {
+        weights_addr[i] = weights_addr[i - 1] + ws_size[i - 1];
+        sums_addr[i]    = sums_addr[i - 1]    + ws_size[i - 1];
+    }
+
+    /* Output frame */
+    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!out) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    /* Execution context */
+    exec = ff_vk_exec_get(&s->e);
+    ff_vk_exec_start(vkctx, exec);
+
+    /* Dependencies */
+    RET(ff_vk_exec_add_dep_frame(vkctx, exec, in,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+    RET(ff_vk_exec_add_dep_frame(vkctx, exec, out,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0));
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &state_buf, 1, 0));
+    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf, 1, 0));
+
+    /* Input frame prep */
+    RET(ff_vk_create_imageviews(vkctx, exec, in_views, in));
+    ff_vk_update_descriptor_img_array(vkctx, &s->pl_weights, exec, in, in_views, 0, 0,
+                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                      s->sampler);
+    ff_vk_frame_barrier(vkctx, exec, in, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_SHADER_READ_BIT,
+                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+
+    /* Output frame prep */
+    RET(ff_vk_create_imageviews(vkctx, exec, out_views, out));
+    ff_vk_frame_barrier(vkctx, exec, out, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_SHADER_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask = ws_vk->stage,
+        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
+        .srcAccessMask = ws_vk->access,
+        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = ws_vk->buf,
+        .size = ws_vk->size,
+        .offset = 0,
+    };
+
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+        .pBufferMemoryBarriers = buf_bar,
+        .bufferMemoryBarrierCount = nb_buf_bar,
+    });
+    ws_vk->stage = buf_bar[0].dstStageMask;
+    ws_vk->access = buf_bar[0].dstAccessMask;
+
+    /* Weights/sums buffer zeroing */
+    vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
+
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask = ws_vk->stage,
+        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .srcAccessMask = ws_vk->access,
+        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = ws_vk->buf,
+        .size = ws_vk->size,
+        .offset = 0,
+    };
+
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = buf_bar,
+        .bufferMemoryBarrierCount = nb_buf_bar,
+    });
+    ws_vk->stage = buf_bar[0].dstStageMask;
+    ws_vk->access = buf_bar[0].dstAccessMask;
+
+    /* Update weights descriptors */
+    ff_vk_update_descriptor_img_array(vkctx, &s->pl_weights, exec, in, in_views, 0, 0,
+                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                      s->sampler);
+    for (int i = 0; i < desc->nb_components; i++) {
+        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, exec, 0, 1 + i*2 + 0, 0,
+                                        weights_addr[i], ws_size[i],
+                                        VK_FORMAT_UNDEFINED));
+        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, exec, 0, 1 + i*2 + 1, 0,
+                                        sums_addr[i], ws_size[i],
+                                        VK_FORMAT_UNDEFINED));
+    }
+
+    /* Update denoise descriptors */
+    ff_vk_update_descriptor_img_array(vkctx, &s->pl_denoise, exec, in, in_views, 0, 0,
+                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                      s->sampler);
+    ff_vk_update_descriptor_img_array(vkctx, &s->pl_denoise, exec, out, out_views, 0, 1,
+                                      VK_IMAGE_LAYOUT_GENERAL, s->sampler);
+    for (int i = 0; i < desc->nb_components; i++) {
+        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_denoise, exec, 0, 2 + i*2 + 0, 0,
+                                        weights_addr[i], ws_size[i],
+                                        VK_FORMAT_UNDEFINED));
+        RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_denoise, exec, 0, 2 + i*2 + 1, 0,
+                                        sums_addr[i], ws_size[i],
+                                        VK_FORMAT_UNDEFINED));
+    }
+
+    /* Weights pipeline */
+    ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_weights);
+
+    for (int i = 0; i < s->nb_offsets; i += TYPE_ELEMS) {
+        int *xoffs = s->xoffsets + i;
+        int *yoffs = s->yoffsets + i;
+        HorizontalPushData pd = {
+            integral_vk->address + t_offset*int_size,
+            state_vk->address + t_offset*state_size,
+            { 0 },
+            { 0 },
+            { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
+            { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
+            { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
+            { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
+            { s->strength[0], s->strength[1], s->strength[2], s->strength[2], },
+            int_stride,
+        };
+
+        memcpy(pd.xoffs, xoffs, sizeof(pd.xoffs));
+        memcpy(pd.yoffs, yoffs, sizeof(pd.yoffs));
+
+        /* Put a barrier once we run out of parallelism buffers */
+        if (!t_offset) {
+            nb_buf_bar = 0;
+            /* Buffer prep/sync */
+            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+                .srcStageMask = integral_vk->stage,
+                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                .srcAccessMask = integral_vk->access,
+                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = integral_vk->buf,
+                .size = integral_vk->size,
+                .offset = 0,
+            };
+            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+                .srcStageMask = state_vk->stage,
+                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                .srcAccessMask = state_vk->access,
+                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = state_vk->buf,
+                .size = state_vk->size,
+                .offset = 0,
+            };
+
+            vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+                .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+                .pBufferMemoryBarriers = buf_bar,
+                .bufferMemoryBarrierCount = nb_buf_bar,
+            });
+            integral_vk->stage = buf_bar[0].dstStageMask;
+            integral_vk->access = buf_bar[0].dstAccessMask;
+            state_vk->stage = buf_bar[1].dstStageMask;
+            state_vk->access = buf_bar[1].dstAccessMask;
+        }
+        t_offset = (t_offset + 1) % s->opts.t;
+
+        /* Push data */
+        ff_vk_update_push_exec(vkctx, exec, &s->pl_weights, VK_SHADER_STAGE_COMPUTE_BIT,
+                               0, sizeof(pd), &pd);
+
+        /* End of horizontal pass */
+        vk->CmdDispatch(exec->buf, 1, 1, 1);
+    }
+
+    RET(denoise_pass(s, exec, ws_vk, ws_stride));
+
+    err = ff_vk_exec_submit(vkctx, exec);
+    if (err < 0)
+        return err;
+
+    err = av_frame_copy_props(out, in);
+    if (err < 0)
+        goto fail;
+
+    av_frame_free(&in);
+
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return err;
+}
+
+static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
+{
+    NLMeansVulkanContext *s = avctx->priv;
+    FFVulkanContext *vkctx = &s->vkctx;
+    FFVulkanFunctions *vk = &vkctx->vkfn;
+
+    ff_vk_exec_pool_free(vkctx, &s->e);
+    ff_vk_pipeline_free(vkctx, &s->pl_weights);
+    ff_vk_shader_free(vkctx, &s->shd_weights);
+    ff_vk_pipeline_free(vkctx, &s->pl_denoise);
+    ff_vk_shader_free(vkctx, &s->shd_denoise);
+
+    av_buffer_pool_uninit(&s->integral_buf_pool);
+    av_buffer_pool_uninit(&s->state_buf_pool);
+    av_buffer_pool_uninit(&s->ws_buf_pool);
+
+    if (s->sampler)
+        vk->DestroySampler(vkctx->hwctx->act_dev, s->sampler,
+                           vkctx->hwctx->alloc);
+
+    ff_vk_uninit(&s->vkctx);
+
+    s->initialized = 0;
+}
+
+#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption nlmeans_vulkan_options[] = {
+    { "s",  "denoising strength for all components", OFFSET(opts.s), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+    { "p",  "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS },
+    { "r",  "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
+    { "t",  "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 36 }, 1, 168, FLAGS },
+
+    { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+    { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+    { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+    { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
+
+    { "p1", "patch size for component 1", OFFSET(opts.pc[0]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
+    { "p2", "patch size for component 2", OFFSET(opts.pc[1]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
+    { "p3", "patch size for component 3", OFFSET(opts.pc[2]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
+    { "p4", "patch size for component 4", OFFSET(opts.pc[3]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
+
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nlmeans_vulkan);
+
+static const AVFilterPad nlmeans_vulkan_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = &nlmeans_vulkan_filter_frame,
+        .config_props = &ff_vk_filter_config_input,
+    },
+};
+
+static const AVFilterPad nlmeans_vulkan_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = &ff_vk_filter_config_output,
+    },
+};
+
+const AVFilter ff_vf_nlmeans_vulkan = {
+    .name           = "nlmeans_vulkan",
+    .description    = NULL_IF_CONFIG_SMALL("Non-local means denoiser (Vulkan)"),
+    .priv_size      = sizeof(NLMeansVulkanContext),
+    .init           = &ff_vk_filter_init,
+    .uninit         = &nlmeans_vulkan_uninit,
+    FILTER_INPUTS(nlmeans_vulkan_inputs),
+    FILTER_OUTPUTS(nlmeans_vulkan_outputs),
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_VULKAN),
+    .priv_class     = &nlmeans_vulkan_class,
+    .flags          = AVFILTER_FLAG_HWDEVICE,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vulkan/prefix_sum.comp b/libavfilter/vulkan/prefix_sum.comp
new file mode 100644
index 0000000000..9147cd82fb
--- /dev/null
+++ b/libavfilter/vulkan/prefix_sum.comp
@@ -0,0 +1,151 @@
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+
+#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
+#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
+
+// These correspond to X, A, P respectively in the prefix sum paper.
+#define FLAG_NOT_READY       0u
+#define FLAG_AGGREGATE_READY 1u
+#define FLAG_PREFIX_READY    2u
+
+layout(buffer_reference, buffer_reference_align = T_ALIGN) nonprivate buffer StateData {
+    DTYPE aggregate;
+    DTYPE prefix;
+    uint flag;
+};
+
+shared DTYPE sh_scratch[WG_SIZE];
+shared DTYPE sh_prefix;
+shared uint sh_part_ix;
+shared uint sh_flag;
+
+void prefix_sum(DataBuffer dst, uint dst_stride, DataBuffer src, uint src_stride)
+{
+    DTYPE local[N_ROWS];
+    // Determine partition to process by atomic counter (described in Section 4.4 of prefix sum paper).
+    if (gl_GlobalInvocationID.x == 0)
+        sh_part_ix = gl_WorkGroupID.x;
+//        sh_part_ix = atomicAdd(part_counter, 1);
+
+    barrier();
+    uint part_ix = sh_part_ix;
+
+    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
+
+    // TODO: gate buffer read? (evaluate whether shader check or CPU-side padding is better)
+    local[0] = src.v[ix*src_stride];
+    for (uint i = 1; i < N_ROWS; i++)
+        local[i] = local[i - 1] + src.v[(ix + i)*src_stride];
+
+    DTYPE agg = local[N_ROWS - 1];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i))
+            agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+        barrier();
+
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+
+    // Publish aggregate for this partition
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+        state[part_ix].aggregate = agg;
+        if (part_ix == 0)
+            state[0].prefix = agg;
+    }
+
+    // Write flag with release semantics
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+        uint flag = part_ix == 0 ? FLAG_PREFIX_READY : FLAG_AGGREGATE_READY;
+        atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
+    }
+
+    DTYPE exclusive = DTYPE(0);
+    if (part_ix != 0) {
+        // step 4 of paper: decoupled lookback
+        uint look_back_ix = part_ix - 1;
+
+        DTYPE their_agg;
+        uint their_ix = 0;
+        while (true) {
+            // Read flag with acquire semantics.
+            if (gl_LocalInvocationID.x == WG_SIZE - 1)
+                sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
+
+            // The flag load is done only in the last thread. However, because the
+            // translation of memoryBarrierBuffer to Metal requires uniform control
+            // flow, we broadcast it to all threads.
+            barrier();
+
+            uint flag = sh_flag;
+            barrier();
+
+            if (flag == FLAG_PREFIX_READY) {
+                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+                    DTYPE their_prefix = state[look_back_ix].prefix;
+                    exclusive = their_prefix + exclusive;
+                }
+                break;
+            } else if (flag == FLAG_AGGREGATE_READY) {
+                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+                    their_agg = state[look_back_ix].aggregate;
+                    exclusive = their_agg + exclusive;
+                }
+                look_back_ix--;
+                their_ix = 0;
+                continue;
+            } // else spins
+
+            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+                // Unfortunately there's no guarantee of forward progress of other
+                // workgroups, so compute a bit of the aggregate before trying again.
+                // In the worst case, spinning stops when the aggregate is complete.
+                DTYPE m = src.v[(look_back_ix * PARTITION_SIZE + their_ix)*src_stride];
+                if (their_ix == 0)
+                    their_agg = m;
+                else
+                    their_agg += m;
+
+                their_ix++;
+                if (their_ix == PARTITION_SIZE) {
+                    exclusive = their_agg + exclusive;
+                    if (look_back_ix == 0) {
+                        sh_flag = FLAG_PREFIX_READY;
+                    } else {
+                        look_back_ix--;
+                        their_ix = 0;
+                    }
+                }
+            }
+            barrier();
+            flag = sh_flag;
+            barrier();
+            if (flag == FLAG_PREFIX_READY)
+                break;
+        }
+
+        // step 5 of paper: compute inclusive prefix
+        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+            DTYPE inclusive_prefix = exclusive + agg;
+            sh_prefix = exclusive;
+            state[part_ix].prefix = inclusive_prefix;
+        }
+
+        if (gl_LocalInvocationID.x == WG_SIZE - 1)
+            atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
+    }
+
+    barrier();
+    if (part_ix != 0)
+        exclusive = sh_prefix;
+
+    DTYPE row = exclusive;
+    if (gl_LocalInvocationID.x > 0)
+        row += sh_scratch[gl_LocalInvocationID.x - 1];
+
+    // note - may overwrite
+    for (uint i = 0; i < N_ROWS; i++)
+        dst.v[(ix + i)*dst_stride] = row + local[i];
+}
diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h
index 32f466a933..58a625dd65 100644
--- a/libavutil/vulkan_functions.h
+++ b/libavutil/vulkan_functions.h
@@ -133,6 +133,7 @@ typedef enum FFVulkanExtensions {
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CreateBuffer)                          \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              BindBufferMemory)                      \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              GetBufferDeviceAddress)                \
+    MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdFillBuffer)                         \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              DestroyBuffer)                         \
                                                                                        \
     /* Image */                                                                        \
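[Editor's note: as a closing aid for readers unfamiliar with the summed-area-table trick the weights shader builds its prefix sums for, here is a minimal scalar CPU sketch of the same idea in C. It is an illustration only — the function names, the fixed 8x8 size, and the test pattern are invented here; the real filter does this per offset, per component, in parallel on the GPU.]

    #include <stdio.h>

    #define W 8
    #define H 8

    /* Clamp a coordinate to the image, mimicking the clamping sampler the
     * filter relies on for out-of-range reads. */
    static int clampc(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Integral image (inclusive 2D prefix sum) of the squared differences
     * between img and a copy of itself shifted by (dx, dy). */
    static void build_integral(float img[H][W], float ii[H][W], int dx, int dy)
    {
        for (int y = 0; y < H; y++) {
            for (int x = 0; x < W; x++) {
                float d = img[y][x] -
                          img[clampc(y + dy, 0, H - 1)][clampc(x + dx, 0, W - 1)];
                float v = d * d;
                if (x > 0)          v += ii[y][x - 1];
                if (y > 0)          v += ii[y - 1][x];
                if (x > 0 && y > 0) v -= ii[y - 1][x - 1];
                ii[y][x] = v;
            }
        }
    }

    /* Sum of squared differences over the patch centred on (x, y): the same
     * four-corner read as the shader's a/b/c/d fetches and
     * "patch_diff = d + a - b - c" (including its corner approximation). */
    static float patch_ssd(float ii[H][W], int x, int y, int p)
    {
        int x0 = clampc(x - p, 0, W - 1), y0 = clampc(y - p, 0, H - 1);
        int x1 = clampc(x + p, 0, W - 1), y1 = clampc(y + p, 0, H - 1);
        float a = ii[y0][x0], b = ii[y1][x0], c = ii[y0][x1], d = ii[y1][x1];
        return d + a - b - c;
    }

    int main(void)
    {
        float img[H][W], ii[H][W];

        for (int y = 0; y < H; y++)
            for (int x = 0; x < W; x++)
                img[y][x] = (x * 31 + y * 17) % 7; /* arbitrary test pattern */

        build_integral(img, ii, 1, 0); /* candidate offset (+1, 0) */
        printf("patch SSD at (4,4), p=2: %f\n", patch_ssd(ii, 4, 4, 2));
        return 0;
    }

[Once the integral image exists, every patch comparison costs four reads regardless of patch size, which is why the filter spends its GPU time on prefix sums rather than on per-pixel patch loops.]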