39 #define TYPE_SIZE (TYPE_ELEMS*4)
99 (uint32_t []) {
WG_SIZE, 1, 1 }, 0);
102 VK_SHADER_STAGE_COMPUTE_BIT);
113 (uint32_t []) {
WG_SIZE, 1, 1 }, 0);
116 VK_SHADER_STAGE_COMPUTE_BIT);
120 .
type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
121 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
129 .
type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
130 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
171 VK_SHADER_STAGE_COMPUTE_BIT);
175 .
type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
176 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
180 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
181 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
184 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
185 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
192 .
type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
193 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
227 VK_SHADER_STAGE_COMPUTE_BIT);
231 .
type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
232 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
236 .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
237 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
245 .
type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
246 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
249 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
250 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
268 int xcnt = 0, ycnt = 0;
273 int offsets_dispatched = 0, nb_dispatches = 0;
275 if (!(
s->opts.r & 1)) {
281 if (!(
s->opts.p & 1)) {
287 for (
int i = 0;
i < 4;
i++) {
288 double str = !
isnan(
s->opts.sc[
i]) ?
s->opts.sc[
i] :
s->opts.s;
289 int ps = (
s->opts.pc[
i] ?
s->opts.pc[
i] :
s->opts.p);
291 s->strength[
i] = 0.0;
295 str = 255.0*255.0 / str;
296 s->strength[
i] = str;
303 s->patch[
i] = ps / 2;
307 s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
308 s->xoffsets =
av_malloc(
s->nb_offsets*
sizeof(*
s->xoffsets));
309 s->yoffsets =
av_malloc(
s->nb_offsets*
sizeof(*
s->yoffsets));
312 for (
int x = -rad; x <= rad; x++) {
313 for (
int y = -rad; y <= rad; y++) {
317 s->xoffsets[xcnt++] = x;
318 s->yoffsets[ycnt++] = y;
324 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
325 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
326 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
329 for (
int i = 0;
i < 2*
s->nb_offsets;
i += 2) {
330 offsets_buf[
i + 0] =
s->xoffsets[
i >> 1];
331 offsets_buf[
i + 1] =
s->yoffsets[
i >> 1];
356 &
s->xyoffsets_buf, 0,
s->xyoffsets_buf.size,
357 VK_FORMAT_UNDEFINED));
361 &
s->xyoffsets_buf, 0,
s->xyoffsets_buf.size,
362 VK_FORMAT_UNDEFINED));
365 int wg_invoc =
FFMIN((
s->nb_offsets - offsets_dispatched)/
TYPE_ELEMS,
s->opts.t);
368 }
while (offsets_dispatched < s->nb_offsets);
371 s->nb_offsets, nb_dispatches);
380 FFVkBuffer *ws_vk, uint32_t comp_offs[4], uint32_t comp_planes[4],
381 uint32_t ws_offset[4], uint32_t ws_stride[4],
382 uint32_t ws_count, uint32_t t, uint32_t nb_components)
388 { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
389 { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
390 { ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
391 { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
402 VK_SHADER_STAGE_COMPUTE_BIT,
405 VkBufferMemoryBarrier2 buf_bar;
407 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
408 SHADER_STORAGE_WRITE_BIT,
409 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
411 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
412 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
413 .pBufferMemoryBarriers = &buf_bar,
414 .bufferMemoryBarrierCount = 1,
418 vk->CmdDispatch(exec->
buf,
440 int plane_heights[4];
442 int offsets_dispatched = 0;
453 uint32_t ws_count = 0;
454 uint32_t ws_offset[4];
455 uint32_t ws_stride[4];
461 VkImageMemoryBarrier2 img_bar[8];
463 VkBufferMemoryBarrier2 buf_bar[2];
478 for (
int i = 0;
i <
desc->nb_components;
i++) {
481 plane_widths[
i] =
FFALIGN(plane_widths[
i],
s->shd_denoise.lg_size[0]);
482 plane_heights[
i] =
FFALIGN(plane_heights[
i],
s->shd_denoise.lg_size[1]);
485 comp_planes[
i] =
desc->comp[
i].plane;
487 ws_stride[
i] = plane_widths[
i];
488 ws_offset[
i] = ws_count;
489 ws_count += ws_stride[
i] * plane_heights[
i];
492 ws_size = ws_count *
sizeof(
float);
496 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
497 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
499 int_size *
s->opts.t *
desc->nb_components,
500 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
506 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
507 VK_BUFFER_USAGE_TRANSFER_DST_BIT,
509 ws_size *
s->
opts.t * 2,
510 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
528 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
529 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
531 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
532 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
543 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
544 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
545 VK_ACCESS_SHADER_READ_BIT,
546 VK_IMAGE_LAYOUT_GENERAL,
547 VK_QUEUE_FAMILY_IGNORED);
552 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
553 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
554 VK_ACCESS_SHADER_WRITE_BIT,
555 VK_IMAGE_LAYOUT_GENERAL,
556 VK_QUEUE_FAMILY_IGNORED);
559 ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
560 TRANSFER_BIT, TRANSFER_WRITE_BIT, NONE_KHR,
563 ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
564 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
566 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
567 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
568 .pImageMemoryBarriers = img_bar,
569 .imageMemoryBarrierCount = nb_img_bar,
570 .pBufferMemoryBarriers = buf_bar,
571 .bufferMemoryBarrierCount = nb_buf_bar,
577 vk->CmdFillBuffer(exec->
buf, ws_vk->
buf, 0, ws_vk->
size, 0x0);
581 VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
584 VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
586 ws_vk, 0, ws_size *
s->
opts.t,
587 VK_FORMAT_UNDEFINED));
589 ws_vk, ws_size *
s->
opts.t, ws_size *
s->
opts.t,
590 VK_FORMAT_UNDEFINED));
594 VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
596 VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
598 ws_vk, 0, ws_size *
s->
opts.t,
599 VK_FORMAT_UNDEFINED));
601 ws_vk, ws_size *
s->
opts.t, ws_size *
s->
opts.t,
602 VK_FORMAT_UNDEFINED));
604 VkPipelineStageFlagBits2 ws_stage = VK_PIPELINE_STAGE_2_TRANSFER_BIT;
605 VkAccessFlagBits2 ws_access = VK_ACCESS_2_TRANSFER_WRITE_BIT;
607 int wg_invoc =
FFMIN((
s->nb_offsets - offsets_dispatched)/
TYPE_ELEMS,
s->opts.t);
609 { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
610 { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
611 {
s->strength[0],
s->strength[1],
s->strength[2],
s->strength[3], },
612 { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
613 { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
616 (uint64_t)int_stride,
623 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
624 COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR,
626 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
627 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
628 .pBufferMemoryBarriers = buf_bar,
629 .bufferMemoryBarrierCount = nb_buf_bar,
635 VK_SHADER_STAGE_COMPUTE_BIT,
637 vk->CmdDispatch(exec->
buf,
639 s->shd_vertical.lg_size[0],
645 COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR,
646 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
647 SHADER_STORAGE_WRITE_BIT,
649 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
650 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
651 .pBufferMemoryBarriers = buf_bar,
652 .bufferMemoryBarrierCount = nb_buf_bar,
658 VK_SHADER_STAGE_COMPUTE_BIT,
660 vk->CmdDispatch(exec->
buf,
662 s->shd_horizontal.lg_size[0],
668 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
669 SHADER_STORAGE_WRITE_BIT,
670 COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
672 buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
673 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
674 .srcStageMask = ws_stage,
675 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
676 .srcAccessMask = ws_access,
677 .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
678 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
679 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
680 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
681 .buffer = ws_vk->
buf,
685 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
686 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
687 .pBufferMemoryBarriers = buf_bar,
688 .bufferMemoryBarrierCount = nb_buf_bar,
691 ws_stage = buf_bar[1].dstStageMask;
692 ws_access = buf_bar[1].dstAccessMask;
695 { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
696 { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
697 { ws_offset[0], ws_offset[1], ws_offset[2], ws_offset[3] },
698 { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
699 {
s->patch[0],
s->patch[1],
s->patch[2],
s->patch[3] },
700 {
s->strength[0],
s->strength[1],
s->strength[2],
s->strength[3], },
701 { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
702 { comp_planes[0], comp_planes[1], comp_planes[2], comp_planes[3] },
705 (uint64_t)int_stride,
712 VK_SHADER_STAGE_COMPUTE_BIT,
713 0,
sizeof(wpd), &wpd);
714 vk->CmdDispatch(exec->
buf,
716 s->shd_weights.lg_size[0],
718 s->shd_weights.lg_size[1],
719 wg_invoc *
desc->nb_components);
722 }
while (offsets_dispatched < s->nb_offsets);
724 RET(
denoise_pass(
s, exec, ws_vk, comp_offs, comp_planes, ws_offset, ws_stride,
725 ws_count,
s->opts.t,
desc->nb_components));
769 #define OFFSET(x) offsetof(NLMeansVulkanContext, x)
770 #define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
810 .
p.
name =
"nlmeans_vulkan",
812 .p.priv_class = &nlmeans_vulkan_class,