#define TYPE_NAME "vec4"
#define TYPE_SIZE (TYPE_ELEMS*4)
#define TYPE_BLOCK_ELEMS 16
#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
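/* Derived sizes: TYPE_SIZE is the byte size of one DTYPE element
 * (TYPE_ELEMS 32-bit components), and TYPE_BLOCK_SIZE the byte size of a
 * 16-element block, the granularity at which the horizontal pass below
 * reads and writes the integral rows. */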
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
GLSLC(1,     DTYPE v[]; );
GLSLC(0, layout(buffer_reference, buffer_reference_align = T_BLOCK_ALIGN) buffer BlockBuffer { );
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1,     vec4 strength; );
GLSLC(1,     uvec4 comp_off; );
GLSLC(1,     uvec4 comp_plane; );
GLSLC(1,     DataBuffer integral_base; );
GLSLC(1,     uint64_t integral_size; );
GLSLC(1,     uint64_t int_stride; );
GLSLC(1,     uint xyoffs_start; );
GLSLC(1,     uint nb_components; );
VK_SHADER_STAGE_COMPUTE_BIT);
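/* Push constants shared by the integral-image shaders: per-component
 * strength and plane mapping, the device address of the integral buffer,
 * and the size/stride used to locate the integral image belonging to each
 * (invocation, component) pair. */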
void *spv_opaque = NULL;
shd = shd_horizontal;
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2,
GLSLC(1, BlockBuffer b_dst; );
GLSLC(1, DTYPE s2; );
GLSLC(1, DTYPE prefix_sum; );
GLSLC(1, DataBuffer integral_data; );
GLSLC(1, uint c_plane; );
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y); );
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z); );
GLSLC(1, if (strength[comp_idx] == 0.0) );
GLSLC(1, offset = integral_size * (invoc_idx * nb_components + comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
GLSLC(1, c_plane = comp_plane[comp_idx]; );
GLSLC(1, pos.y = int(gl_GlobalInvocationID.x); );
GLSLC(2, prefix_sum = DTYPE(0); );
GLSLC(2, offset = int_stride * uint64_t(pos.y); );
GLSLC(2, b_dst = BlockBuffer(uint64_t(integral_data) + offset); );
GLSLC(2, for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) { );
GLSLC(3, block = b_dst.v[k]; );
GLSLC(3, for (o = 0; o < T_BLOCK_ELEMS; o++) { );
GLSLC(4, s2 = block.data[o]; );
GLSLC(4, block.data[o] = s2 + prefix_sum; );
GLSLC(4, prefix_sum += s2; );
GLSLC(3, b_dst.v[k] = block; );
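/* The loop above computes an inclusive prefix sum along one row of the
 * integral buffer, T_BLOCK_ELEMS elements per block load/store. Combined
 * with the column-wise prefix sums written by the shader below, this turns
 * the buffer of squared pixel differences into a 2D summed-area table. */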
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2,
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "xyoffsets_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "ivec2 xyoffsets[];",
GLSLC(1, float s1; );
GLSLC(1, DTYPE s2; );
GLSLC(1, DTYPE prefix_sum; );
GLSLC(1, ivec2 pos_off; );
GLSLC(1, DataBuffer integral_data; );
GLSLC(1, uint c_off; );
GLSLC(1, uint c_plane; );
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y); );
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z); );
GLSLC(1, if (strength[comp_idx] == 0.0) );
GLSLC(1, offset = integral_size * (invoc_idx * nb_components + comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
GLSLC(1, c_off = comp_off[comp_idx]; );
GLSLC(1, c_plane = comp_plane[comp_idx]; );
GLSLC(1, size = imageSize(input_img[c_plane]); );
GLSLC(1, pos.x = int(gl_GlobalInvocationID.x); );
GLSLC(2, prefix_sum = DTYPE(0); );
GLSLC(2, for (pos.y = 0; pos.y < height[c_plane]; pos.y++) { );
GLSLC(3, offset = int_stride * uint64_t(pos.y); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4, s1 = imageLoad(input_img[c_plane], pos)[c_off]; );
for (int i = 0; i < TYPE_ELEMS; i++) {
GLSLF(4, pos_off = pos + offs[%i]; ,i);
GLSLC(4, if (!IS_WITHIN(uvec2(pos_off), size)) );
GLSLF(5, s2[%i] = s1; ,i);
GLSLF(5, s2[%i] = imageLoad(input_img[c_plane], pos_off)[c_off]; ,i);
GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
GLSLC(3, dst.v[pos.x] = s2 + prefix_sum; );
GLSLC(3, prefix_sum += s2; );
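/* Each invocation walks one column: it loads the source pixel s1 and, for
 * every one of the TYPE_ELEMS offsets, the shifted pixel s2, then
 * accumulates the squared difference (s1 - s2)^2 down the column via
 * prefix_sum. Out-of-bounds offsets reuse s1, contributing zero. */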
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
spv->free_shader(spv, &spv_opaque);
void *spv_opaque = NULL;
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2,
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
GLSLC(1,     DTYPE v[]; );
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1,     uvec4 ws_offset; );
GLSLC(1,     uvec4 ws_stride; );
GLSLC(1,     ivec4 patch_size; );
GLSLC(1,     vec4 strength; );
GLSLC(1,     uvec4 comp_off; );
GLSLC(1,     uvec4 comp_plane; );
GLSLC(1,     DataBuffer integral_base; );
GLSLC(1,     uint64_t integral_size; );
GLSLC(1,     uint64_t int_stride; );
GLSLC(1,     uint xyoffs_start; );
GLSLC(1,     uint ws_count; );
GLSLC(1,     uint nb_components; );
VK_SHADER_STAGE_COMPUTE_BIT);
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "weights_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights[];",
.name = "sums_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums[];",
.name = "xyoffsets_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "ivec2 xyoffsets[];",
GLSLC(1, ivec2 pos_off; );
GLSLC(1, DataBuffer integral_data; );
GLSLC(1, uint c_off; );
GLSLC(1, uint c_plane; );
GLSLC(1, uint ws_off; );
GLSLC(1, pos = ivec2(gl_GlobalInvocationID.xy); );
GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.z) %% nb_components; );
GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components; );
GLSLC(1, c_off = comp_off[comp_idx]; );
GLSLC(1, c_plane = comp_plane[comp_idx]; );
GLSLC(1, p = patch_size[comp_idx]; );
GLSLC(1, s = strength[comp_idx]; );
GLSLC(1, offset = integral_size * (invoc_idx * nb_components + comp_idx); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
GLSLC(1, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
GLSLC(1, size = imageSize(input_img[c_plane]); );
GLSLC(1, DTYPE patch_diff; );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
for (int i = 0; i < 4; i++) {
GLSLC(1, if (!IS_WITHIN(uvec2(pos_off), size)) );
GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos)[c_off]; ,i);
GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos_off)[c_off]; ,i);
GLSLC(1, patch_diff = d + a - b - c; );
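/* Standard summed-area-table box sum: with a, b, c and d being the integral
 * values at the four patch corners (fetched in the elided lines above), the
 * sum of squared differences over the whole patch is d + a - b - c. */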
GLSLC(1, w_sum = w[0] + w[1] + w[2] + w[3]; );
GLSLC(1, sums[ws_off] += sum; );
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
void *spv_opaque = NULL;
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference", "GL_EXT_buffer_reference2" }, 2,
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1,     uvec4 comp_off; );
GLSLC(1,     uvec4 comp_plane; );
GLSLC(1,     uvec4 ws_offset; );
GLSLC(1,     uvec4 ws_stride; );
GLSLC(1,     uint32_t ws_count; );
GLSLC(1,     uint32_t t; );
GLSLC(1,     uint32_t nb_components; );
VK_SHADER_STAGE_COMPUTE_BIT);
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "output_img",
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali = "writeonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "weights_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights[];",
.name = "sums_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums[];",
GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
GLSLC(1, const uvec2 size = imageSize(output_img[plane]); );
GLSLC(1, uint c_off; );
GLSLC(1, uint c_plane; );
GLSLC(1, uint ws_off; );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
GLSLC(1, uint invoc_idx; );
GLSLC(1, uint comp_idx; );
GLSLC(1, src = imageLoad(input_img[plane], pos); );
GLSLC(1, for (comp_idx = 0; comp_idx < nb_components; comp_idx++) { );
GLSLC(2, if (plane == comp_plane[comp_idx]) { );
GLSLC(3, w_sum = 0.0; );
GLSLC(3, sum = 0.0; );
GLSLC(3, for (invoc_idx = 0; invoc_idx < t; invoc_idx++) { );
GLSLC(4, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
GLSLC(4, sum += sums[ws_off]; );
GLSLC(3, c_off = comp_off[comp_idx]; );
GLSLC(3, r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 255; );
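/* Weighted average with the source pixel given an implicit weight of 1.
 * sums[] appears to accumulate weight * neighbour on a 0..255 scale, so the
 * blend is (sum + src * 255) / (1 + w_sum), scaled back to 0..1. */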
GLSLC(1, imageStore(output_img[plane], pos, r); );
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
int xcnt = 0, ycnt = 0;
int offsets_dispatched = 0, nb_dispatches = 0;
if (!(s->opts.r & 1)) {
if (!(s->opts.p & 1)) {
for (int i = 0; i < 4; i++) {
double str = !isnan(s->opts.sc[i]) ? s->opts.sc[i] : s->opts.s;
int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
s->strength[i] = 0.0;
str = 255.0*255.0 / str;
s->strength[i] = str;
s->patch[i] = ps / 2;
s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
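/* Research window: every (x, y) offset with |x|, |y| <= rad, minus the
 * center (0, 0), giving (2*rad + 1)^2 - 1 candidate positions. */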
s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));
for (int x = -rad; x <= rad; x++) {
for (int y = -rad; y <= rad; y++) {
s->xoffsets[xcnt++] = x;
s->yoffsets[ycnt++] = y;
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
for (int i = 0; i < 2*s->nb_offsets; i += 2) {
offsets_buf[i + 0] = s->xoffsets[i >> 1];
offsets_buf[i + 1] = s->yoffsets[i >> 1];
spv = ff_vk_spirv_init();
&s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
VK_FORMAT_UNDEFINED));
&s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
VK_FORMAT_UNDEFINED));
int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
} while (offsets_dispatched < s->nb_offsets);
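/* Offsets are consumed in batches: each dispatch covers up to s->opts.t
 * parallel invocations of TYPE_ELEMS offsets each, looping until the whole
 * research window has been dispatched. */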
s->nb_offsets, nb_dispatches);
FFVkBuffer *ws_vk, uint32_t comp_offs[4], uint32_t comp_planes[4],
uint32_t ws_offset[4], uint32_t ws_stride[4],
uint32_t ws_count, uint32_t t, uint32_t nb_components)
VkBufferMemoryBarrier2 buf_bar[2];
{ comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
{ comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
{ ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
{ ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
VK_SHADER_STAGE_COMPUTE_BIT,
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
vk->CmdDispatch(exec->buf,
int plane_heights[4];
int offsets_dispatched = 0;
uint32_t ws_count = 0;
uint32_t ws_offset[4];
uint32_t ws_stride[4];
VkImageMemoryBarrier2 img_bar[8];
VkBufferMemoryBarrier2 buf_bar[2];
for (int i = 0; i < desc->nb_components; i++) {
plane_widths[i] = FFALIGN(plane_widths[i], s->shd_denoise.lg_size[0]);
plane_heights[i] = FFALIGN(plane_heights[i], s->shd_denoise.lg_size[1]);
comp_planes[i] = desc->comp[i].plane;
ws_stride[i] = plane_widths[i];
ws_offset[i] = ws_count;
ws_count += ws_stride[i] * plane_heights[i];
ws_size = ws_count * sizeof(float);
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
int_size * s->opts.t * desc->nb_components,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
ws_size * s->opts.t * 2,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
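/* Buffer sizing: one integral image per (invocation, component) pair, hence
 * int_size * t * nb_components; the ws buffer packs the weights and sums
 * planes back to back, hence ws_size * t * 2, with the weights half at
 * offset 0 and the sums half at ws_size * t (see the descriptor updates
 * below). */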
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_READ_BIT,
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_WRITE_BIT,
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pImageMemoryBarriers = img_bar,
    .imageMemoryBarrierCount = nb_img_bar,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
ws_vk, 0, ws_size * s->opts.t,
VK_FORMAT_UNDEFINED));
ws_vk, ws_size * s->opts.t, ws_size * s->opts.t,
VK_FORMAT_UNDEFINED));
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
ws_vk, 0, ws_size * s->opts.t,
VK_FORMAT_UNDEFINED));
ws_vk, ws_size * s->opts.t, ws_size * s->opts.t,
VK_FORMAT_UNDEFINED));
int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
{ plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
{ plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
{ s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
{ comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
{ comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
(uint64_t)int_stride,
VK_SHADER_STAGE_COMPUTE_BIT,
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
integral_vk->stage = buf_bar[0].dstStageMask;
integral_vk->access = buf_bar[0].dstAccessMask;
vk->CmdDispatch(exec->buf,
desc->nb_components,
VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pd), &pd);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
integral_vk->stage = buf_bar[0].dstStageMask;
integral_vk->access = buf_bar[0].dstAccessMask;
vk->CmdDispatch(exec->buf,
desc->nb_components,
{ plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
{ plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
{ ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
{ ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
{ s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
{ s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
{ comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
{ comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
(uint64_t)int_stride,
desc->nb_components,
VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(wpd), &wpd);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,
    .size = ws_vk->size,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
integral_vk->stage = buf_bar[0].dstStageMask;
integral_vk->access = buf_bar[0].dstAccessMask;
ws_vk->stage = buf_bar[1].dstStageMask;
ws_vk->access = buf_bar[1].dstAccessMask;
vk->CmdDispatch(exec->buf,
wg_invoc * desc->nb_components);
} while (offsets_dispatched < s->nb_offsets);
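/* All weights/sums have been accumulated; a single denoise pass now blends
 * them with the source image and writes the output planes. */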
RET(denoise_pass(s, exec, ws_vk, comp_offs, comp_planes, ws_offset, ws_stride,
                 ws_count, s->opts.t, desc->nb_components));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
.p.name = "nlmeans_vulkan",
.p.priv_class = &nlmeans_vulkan_class,