#define TYPE_NAME  "vec4"
#define TYPE_SIZE  (TYPE_ELEMS*4)
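/* insert_first(): emits GLSL that fetches the reference sample (s1) and the
 * four xy-offset samples (s2[0..3]) for the line currently being integrated,
 * then stores the elementwise squared differences in s2. "horiz" selects
 * whether the r/off displacement is applied along x or along y. */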
    GLSLF(4, s1    = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
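/* Horizontal pass: each invocation owns nb_rows rows of the integral image
 * and accumulates a running prefix_sum along x, so that after the pass
 * dst.v[pos.x] holds the sum of all squared differences to its left. */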
    GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, barrier(); );
    GLSLC(2, #pragma unroll(1) );
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLC(3, prefix_sum = DTYPE(0); );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
    insert_first(shd, 0, "r", 0, plane, comp);
    GLSLC(4, s2 = dst.v[pos.x]; );
    GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
    GLSLC(4, prefix_sum += s2; );
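/* Vertical pass: the same prefix-sum idea along y, with one running sum
 * (psum[r]) per column handled by this invocation. */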
    GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, #pragma unroll(1) );
    GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
    GLSLC(2, psum[r] = DTYPE(0); );
    GLSLC(1, barrier(); );
    GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
    GLSLC(3, offset = int_stride * uint64_t(pos.y); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(3, #pragma unroll(1) );
    GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
    insert_first(shd, 0, "r", 1, plane, comp);
    GLSLC(4, s2 = dst.v[pos.x + r]; );
    GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
    GLSLC(4, psum[r] += s2; );
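/* Weights pass: for every pixel, read the four corners of the patch box
 * from the integral image, turn the patch distance into a weight, and
 * accumulate it into the per-plane weights/sums buffers. */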
                                int t, int dst_comp, int plane, int comp)
    GLSLF(1, p = patch_size[%i]; ,dst_comp);
    GLSLC(1, barrier(); );
    GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
    GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    GLSLC(3, a = DTYPE(0); );
    GLSLC(3, b = DTYPE(0); );
    GLSLC(3, c = DTYPE(0); );
    GLSLC(3, d = DTYPE(0); );
    GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
    GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
    GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
    GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
    GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
    GLSLC(3, if (lt == false) { );
    GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, a = dst.v[pos.x - p]; );
    GLSLC(4, c = dst.v[pos.x + p]; );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, b = dst.v[pos.x - p]; );
    GLSLC(4, d = dst.v[pos.x + p]; );
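/* Standard summed-area-table lookup: with a = top-left, b = bottom-left,
 * c = top-right and d = bottom-right corner of the patch box,
 * d + a - b - c is the sum of squared differences over the whole patch. */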
    GLSLC(3, patch_diff = d + a - b - c; );
    GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
    GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
    GLSLC(3, sum = dot(w, src*255); );
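/* The two accumulation variants below are selected by the surrounding
 * branch on the t parameter (elided here): with multiple offset batches
 * running in parallel the accumulation must be atomic; the plain
 * read-modify-write form is the single-dispatch fallback. */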
    GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
    GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
    GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
    GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
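/* Push constants for the weights pipeline; the integral buffer is passed
 * as a raw device address. */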
typedef struct HorizontalPushData {
    uint32_t ws_stride[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint32_t xyoffs_start;
} HorizontalPushData;
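/* Pick a workgroup size that covers the largest plane dimension: when the
 * device limit is smaller, stack multiple rows per invocation instead. */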
    void *spv_opaque = NULL;
    uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
    int wg_size, wg_rows;

    if (max_wg > max_dim) {
    } else if (max_wg < max_dim) {
        while (wg_size*wg_rows < max_dim)
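/* GL_EXT_shader_atomic_float is needed for the float atomicAdd() above;
 * GL_ARB_gpu_shader_int64 for the 64-bit buffer-reference arithmetic. */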
    GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
    GLSLC(1, DTYPE v[]; );
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(1, ivec4 patch_size; );
    GLSLC(1, vec4 strength; );
    GLSLC(1, DataBuffer integral_base; );
    GLSLC(1, uint64_t integral_size; );
    GLSLC(1, uint64_t int_stride; );
    GLSLC(1, uint xyoffs_start; );
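/* Descriptors: the sampled input image, one weights and one sums storage
 * buffer per plane (up to 4), and the read-only xy-offset table. */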
        .type        = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .name        = "weights_buffer_0",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_0[];",
        .name        = "sums_buffer_0",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_0[];",
        .name        = "weights_buffer_1",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_1[];",
        .name        = "sums_buffer_1",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_1[];",
        .name        = "weights_buffer_2",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_2[];",
        .name        = "sums_buffer_2",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_2[];",
        .name        = "weights_buffer_3",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_3[];",
        .name        = "sums_buffer_3",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_3[];",
        .name        = "xyoffsets_buffer",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "ivec2 xyoffsets[];",
    GLSLC(1, float s1; );
    GLSLC(1, DTYPE s2; );
    GLSLC(1, DTYPE prefix_sum; );
    GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
    GLSLC(1, DataBuffer integral_data; );
    GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
    GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
    GLSLC(1, DTYPE patch_diff; );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    for (int i = 0; i < desc->nb_components; i++) {
    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
typedef struct DenoisePushData {
    uint32_t ws_stride[4];
    void *spv_opaque = NULL;
                                   VK_SHADER_STAGE_COMPUTE_BIT, 0));
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
        .type      = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .stages    = VK_SHADER_STAGE_COMPUTE_BIT,
        .name      = "output_img",
        .type      = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
        .mem_quali = "writeonly",
        .stages    = VK_SHADER_STAGE_COMPUTE_BIT,
        .name        = "weights_buffer_0",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_0[];",
        .name        = "sums_buffer_0",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_0[];",
        .name        = "weights_buffer_1",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_1[];",
        .name        = "sums_buffer_1",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_1[];",
        .name        = "weights_buffer_2",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_2[];",
        .name        = "sums_buffer_2",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_2[];",
        .name        = "weights_buffer_3",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_3[];",
        .name        = "sums_buffer_3",
        .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali   = "readonly",
        .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_3[];",
    GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
    GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    GLSLC(1, size = imageSize(output_img[plane]); );
    for (int c = 0; c < desc->nb_components; c++) {
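    /* Normalized NLMeans average: the source pixel itself contributes with
     * weight 1.0, everything else through the accumulated sums/weights;
     * values are scaled to [0;255] and back around the division. */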
    GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
    GLSLC(1, imageStore(output_img[plane], pos, r); );
    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
    int xcnt = 0, ycnt = 0;
    int offsets_dispatched = 0, nb_dispatches = 0;

    if (!(s->opts.r & 1)) {
    if (!(s->opts.p & 1)) {

    for (int i = 0; i < 4; i++) {
        double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
        int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
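        /* Map the user strength to the multiplier used as
         * exp(patch_diff * strength) in the shader; the 255*255 factor
         * matches the *255 pixel scaling applied there. */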
        str = 255.0*255.0 / str;
        s->strength[i] = str;
        s->patch[i] = ps / 2;
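    /* Research window: every (x, y) offset in a (2*rad + 1)^2 square around
     * the pixel; the -1 accounts for the centre offset, which is skipped. */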
    s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
    s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
    s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));

    for (int x = -rad; x <= rad; x++) {
        for (int y = -rad; y <= rad; y++) {
            s->xoffsets[xcnt++] = x;
            s->yoffsets[ycnt++] = y;
                          VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
                          VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));

    for (int i = 0; i < 2*s->nb_offsets; i += 2) {
        offsets_buf[i + 0] = s->xoffsets[i >> 1];
        offsets_buf[i + 1] = s->yoffsets[i >> 1];

               "disabling dispatch parallelism\n");

    spv = ff_vk_spirv_init();

    RET(init_weights_pipeline(vkctx, &s->e, &s->pl_weights, &s->shd_weights, s->sampler,
                              spv, s->vkctx.output_width, s->vkctx.output_height,

    RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,

                              &s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
                              VK_FORMAT_UNDEFINED));
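/* Each weights dispatch consumes TYPE_ELEMS xy-offsets per invocation;
 * up to opts.t batches run in parallel along the workgroup z axis. */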
        int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
    } while (offsets_dispatched < s->nb_offsets);

           s->nb_offsets, nb_dispatches);
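/* denoise_pass: make the accumulated weights/sums visible to the compute
 * stage, then dispatch the averaging shader. */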
    VkBufferMemoryBarrier2 buf_bar[8];

    DenoisePushData pd = {
        { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },

    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,

    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;

    vk->CmdDispatch(exec->buf,
    int plane_heights[4];
    int offsets_dispatched = 0;
    VkDeviceSize weights_offs[4];
    VkDeviceSize sums_offs[4];
    uint32_t ws_stride[4];
    size_t ws_total_size = 0;
    VkImageMemoryBarrier2 img_bar[8];
    VkBufferMemoryBarrier2 buf_bar[8];
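    /* The integral image is (wg_size * rows) texels squared at TYPE_SIZE
     * bytes each; one copy per parallel dispatch is allocated below. */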
    int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows*TYPE_SIZE;
    int_size = s->pl_weights.wg_size[0]*s->pl_weights_rows*int_stride;

    for (int i = 0; i < desc->nb_components; i++) {
        plane_widths[i] = FFALIGN(plane_widths[i], s->pl_denoise.wg_size[0]);
        plane_heights[i] = FFALIGN(plane_heights[i], s->pl_denoise.wg_size[1]);

        ws_stride[i] = plane_widths[i];
        ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
        ws_total_size += ws_size[i];

                          VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                          VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                          s->opts.t * int_size,
                          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                          VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                          VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                          VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
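    /* Weights and sums for all planes share one buffer: the weights blocks
     * come first, the sums block starts at ws_total_size. */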
    sums_offs[0] = ws_total_size;
    for (int i = 1; i < desc->nb_components; i++) {
        weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1];
        sums_offs[i] = sums_offs[i - 1] + ws_size[i - 1];
                      VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                      VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
                      VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                      VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_READ_BIT,
                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        VK_QUEUE_FAMILY_IGNORED);
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);

    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,

    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = integral_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = integral_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = integral_vk->buf,
        .size = integral_vk->size,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pImageMemoryBarriers = img_bar,
            .imageMemoryBarrierCount = nb_img_bar,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,

    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
    integral_vk->stage = buf_bar[1].dstStageMask;
    integral_vk->access = buf_bar[1].dstAccessMask;
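    /* Zero-initialize the weights/sums buffer before any accumulation. */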
    vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);

    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,

    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
                               VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,

    for (int i = 0; i < desc->nb_components; i++) {
                                        ws_vk, weights_offs[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
                                        ws_vk, sums_offs[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));

                               VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                               VK_IMAGE_LAYOUT_GENERAL, s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
                                        ws_vk, weights_offs[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
                                        ws_vk, sums_offs[i], ws_size[i],
                                        VK_FORMAT_UNDEFINED));
    HorizontalPushData pd = {
        { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
        { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
        { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
        { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
        { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
        (uint64_t)int_stride,
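    /* Between successive offset batches the integral buffer must be made
     * coherent: the next weights dispatch both reads and rewrites it. */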
        if (offsets_dispatched) {
            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                .srcStageMask = integral_vk->stage,
                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                .srcAccessMask = integral_vk->access,
                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .buffer = integral_vk->buf,
                .size = integral_vk->size,

            vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
                    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
                    .pBufferMemoryBarriers = buf_bar,
                    .bufferMemoryBarrierCount = nb_buf_bar,

            integral_vk->stage = buf_bar[0].dstStageMask;
            integral_vk->access = buf_bar[0].dstAccessMask;

        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);

        vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
    } while (offsets_dispatched < s->nb_offsets);
    RET(denoise_pass(s, exec, ws_vk, ws_stride));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {

static const AVFilterPad nlmeans_vulkan_inputs[] = {
        .filter_frame = &nlmeans_vulkan_filter_frame,

static const AVFilterPad nlmeans_vulkan_outputs[] = {

    .name = "nlmeans_vulkan",
    .uninit = &nlmeans_vulkan_uninit,
    .priv_class = &nlmeans_vulkan_class,