[FFmpeg-cvslog] nlmeans_vulkan: parallelize workgroup invocations
Lynne
git at videolan.org
Wed Oct 11 18:18:52 EEST 2023
ffmpeg | branch: master | Lynne <dev at lynne.ee> | Fri Sep 15 21:55:59 2023 +0200| [f31d0f11417067a3fc9d53085c32f4ba82b252e4] | committer: Lynne
nlmeans_vulkan: parallelize workgroup invocations
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f31d0f11417067a3fc9d53085c32f4ba82b252e4
---
libavfilter/Makefile | 3 +-
libavfilter/vf_nlmeans_vulkan.c | 438 +++++++++++++++++++------------------
libavfilter/vulkan/prefix_sum.comp | 151 -------------
3 files changed, 224 insertions(+), 368 deletions(-)
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 9a100cd665..603b532ad0 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -395,8 +395,7 @@ OBJS-$(CONFIG_MULTIPLY_FILTER) += vf_multiply.o
OBJS-$(CONFIG_NEGATE_FILTER) += vf_negate.o
OBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o
OBJS-$(CONFIG_NLMEANS_OPENCL_FILTER) += vf_nlmeans_opencl.o opencl.o opencl/nlmeans.o
-OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER) += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o \
- vulkan/prefix_sum.o
+OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER) += vf_nlmeans_vulkan.o vulkan.o vulkan_filter.o
OBJS-$(CONFIG_NNEDI_FILTER) += vf_nnedi.o
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index 9741dd67ac..2b8f97d7d9 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -38,9 +38,10 @@ typedef struct NLMeansVulkanContext {
VkSampler sampler;
AVBufferPool *integral_buf_pool;
- AVBufferPool *state_buf_pool;
AVBufferPool *ws_buf_pool;
+ FFVkBuffer xyoffsets_buf;
+
int pl_weights_rows;
FFVulkanPipeline pl_weights;
FFVkSPIRVShader shd_weights;
@@ -66,107 +67,97 @@ typedef struct NLMeansVulkanContext {
extern const char *ff_source_prefix_sum_comp;
-static void insert_first(FFVkSPIRVShader *shd, int r, int horiz, int plane, int comp)
+static void insert_first(FFVkSPIRVShader *shd, int r, const char *off, int horiz, int plane, int comp)
{
- GLSLF(2, s1 = texture(input_img[%i], ivec2(x + %i, y + %i))[%i];
- ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-
- if (TYPE_ELEMS == 4) {
- GLSLF(2, s2[0] = texture(input_img[%i], ivec2(x + %i + xoffs[0], y + %i + yoffs[0]))[%i];
- ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
- GLSLF(2, s2[1] = texture(input_img[%i], ivec2(x + %i + xoffs[1], y + %i + yoffs[1]))[%i];
- ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
- GLSLF(2, s2[2] = texture(input_img[%i], ivec2(x + %i + xoffs[2], y + %i + yoffs[2]))[%i];
- ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
- GLSLF(2, s2[3] = texture(input_img[%i], ivec2(x + %i + xoffs[3], y + %i + yoffs[3]))[%i];
- ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
- } else {
- for (int i = 0; i < 16; i++) {
- GLSLF(2, s2[%i][%i] = texture(input_img[%i], ivec2(x + %i + xoffs[%i], y + %i + yoffs[%i]))[%i];
- ,i / 4, i % 4, plane, horiz ? r : 0, i, !horiz ? r : 0, i, comp);
- }
- }
-
- GLSLC(2, s2 = (s1 - s2) * (s1 - s2); );
+ GLSLF(4, s1 = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
+ ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+
+ GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
+ ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+ GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
+ ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+ GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
+ ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+ GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
+ ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+
+ GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
}
static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
{
- GLSLF(1, x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
- if (!first) {
- GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
- gl_StorageSemanticsBuffer,
- gl_SemanticsAcquireRelease |
- gl_SemanticsMakeAvailable |
- gl_SemanticsMakeVisible); );
- }
- GLSLF(1, for (y = 0; y < height[%i]; y++) { ,plane);
- GLSLC(2, offset = uint64_t(int_stride)*y*T_ALIGN; );
- GLSLC(2, dst = DataBuffer(uint64_t(integral_data) + offset); );
- GLSLC(0, );
- if (first) {
- for (int r = 0; r < nb_rows; r++) {
- insert_first(shd, r, 1, plane, comp);
- GLSLF(2, dst.v[x + %i] = s2; ,r);
- GLSLC(0, );
- }
- }
- GLSLC(2, barrier(); );
- GLSLC(2, prefix_sum(dst, 1, dst, 1); );
- GLSLC(1, } );
- GLSLC(0, );
+ GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
+ if (!first)
+ GLSLC(1, barrier(); );
+ GLSLC(0, );
+ GLSLF(1, if (pos.y < height[%i]) { ,plane);
+ GLSLC(2, #pragma unroll(1) );
+ GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
+ GLSLC(3, prefix_sum = DTYPE(0); );
+ GLSLC(3, offset = uint64_t(int_stride)*(pos.y + r)*T_ALIGN; );
+ GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
+ GLSLC(0, );
+ GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
+ if (first)
+ insert_first(shd, 0, "r", 0, plane, comp);
+ else
+ GLSLC(4, s2 = dst.v[pos.x]; );
+ GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
+ GLSLC(4, prefix_sum += s2; );
+ GLSLC(3, } );
+ GLSLC(2, } );
+ GLSLC(1, } );
+ GLSLC(0, );
}
static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
{
- GLSLF(1, y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
- if (!first) {
- GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
- gl_StorageSemanticsBuffer,
- gl_SemanticsAcquireRelease |
- gl_SemanticsMakeAvailable |
- gl_SemanticsMakeVisible); );
- }
- GLSLF(1, for (x = 0; x < width[%i]; x++) { ,plane);
- GLSLC(2, dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN); );
-
- for (int r = 0; r < nb_rows; r++) {
- if (first) {
- insert_first(shd, r, 0, plane, comp);
- GLSLF(2, integral_data.v[(y + %i)*int_stride + x] = s2; ,r);
- GLSLC(0, );
- }
- }
-
- GLSLC(2, barrier(); );
- GLSLC(2, prefix_sum(dst, int_stride, dst, int_stride); );
- GLSLC(1, } );
- GLSLC(0, );
+ GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
+ GLSLC(1, #pragma unroll(1) );
+ GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
+ GLSLC(2, psum[r] = DTYPE(0); );
+ GLSLC(0, );
+ if (!first)
+ GLSLC(1, barrier(); );
+ GLSLC(0, );
+ GLSLF(1, if (pos.x < width[%i]) { ,plane);
+ GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
+ GLSLC(3, offset = uint64_t(int_stride)*pos.y*T_ALIGN; );
+ GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
+ GLSLC(0, );
+ GLSLC(3, #pragma unroll(1) );
+ GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
+ if (first)
+ insert_first(shd, 0, "r", 1, plane, comp);
+ else
+ GLSLC(4, s2 = dst.v[pos.x + r]; );
+ GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
+ GLSLC(4, psum[r] += s2; );
+ GLSLC(3, } );
+ GLSLC(2, } );
+ GLSLC(1, } );
+ GLSLC(0, );
}
static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
int t, int dst_comp, int plane, int comp)
{
- GLSLF(1, p = patch_size[%i]; ,dst_comp);
+ GLSLF(1, p = patch_size[%i]; ,dst_comp);
GLSLC(0, );
- GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
- gl_StorageSemanticsBuffer,
- gl_SemanticsAcquireRelease |
- gl_SemanticsMakeAvailable |
- gl_SemanticsMakeVisible); );
GLSLC(1, barrier(); );
+ GLSLC(0, );
if (!vert) {
- GLSLF(1, for (y = 0; y < height[%i]; y++) { ,plane);
+ GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
GLSLC(3, break; );
- GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
- GLSLF(3, x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
+ GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
+ GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
} else {
- GLSLF(1, for (x = 0; x < width[%i]; x++) { ,plane);
+ GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
GLSLC(3, break; );
- GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
- GLSLF(3, y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
+ GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
+ GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
}
GLSLC(0, );
GLSLC(3, a = DTYPE(0); );
@@ -174,25 +165,25 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
GLSLC(3, c = DTYPE(0); );
GLSLC(3, d = DTYPE(0); );
GLSLC(0, );
- GLSLC(3, lt = ((x - p) < 0) || ((y - p) < 0); );
+ GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
GLSLC(0, );
if (TYPE_ELEMS == 4) {
- GLSLF(3, src[0] = texture(input_img[%i], ivec2(x + xoffs[0], y + yoffs[0]))[%i]; ,plane, comp);
- GLSLF(3, src[1] = texture(input_img[%i], ivec2(x + xoffs[1], y + yoffs[1]))[%i]; ,plane, comp);
- GLSLF(3, src[2] = texture(input_img[%i], ivec2(x + xoffs[2], y + yoffs[2]))[%i]; ,plane, comp);
- GLSLF(3, src[3] = texture(input_img[%i], ivec2(x + xoffs[3], y + yoffs[3]))[%i]; ,plane, comp);
+ GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
+ GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
+ GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
+ GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
} else {
for (int i = 0; i < 16; i++)
- GLSLF(3, src[%i][%i] = texture(input_img[%i], ivec2(x + xoffs[%i], y + yoffs[%i]))[%i];
- ,i / 4, i % 4, plane, i, i, comp);
+ GLSLF(3, src[%i][%i] = texture(input_img[%i], pos + offs[%i])[%i];
+ ,i / 4, i % 4, plane, i, comp);
}
GLSLC(0, );
GLSLC(3, if (lt == false) { );
- GLSLC(4, a = integral_data.v[(y - p)*int_stride + x - p]; );
- GLSLC(4, c = integral_data.v[(y - p)*int_stride + x + p]; );
- GLSLC(4, b = integral_data.v[(y + p)*int_stride + x - p]; );
- GLSLC(4, d = integral_data.v[(y + p)*int_stride + x + p]; );
+ GLSLC(4, a = integral_data.v[(pos.y - p)*int_stride + pos.x - p]; );
+ GLSLC(4, c = integral_data.v[(pos.y - p)*int_stride + pos.x + p]; );
+ GLSLC(4, b = integral_data.v[(pos.y + p)*int_stride + pos.x - p]; );
+ GLSLC(4, d = integral_data.v[(pos.y + p)*int_stride + pos.x + p]; );
GLSLC(3, } );
GLSLC(0, );
GLSLC(3, patch_diff = d + a - b - c; );
@@ -212,27 +203,26 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
}
GLSLC(0, );
if (t > 1) {
- GLSLF(3, atomicAdd(weights_%i[y*ws_stride[%i] + x], w_sum); ,dst_comp, dst_comp);
- GLSLF(3, atomicAdd(sums_%i[y*ws_stride[%i] + x], sum); ,dst_comp, dst_comp);
+ GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
+ GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
} else {
- GLSLF(3, weights_%i[y*ws_stride[%i] + x] += w_sum; ,dst_comp, dst_comp);
- GLSLF(3, sums_%i[y*ws_stride[%i] + x] += sum; ,dst_comp, dst_comp);
+ GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
+ GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
}
GLSLC(2, } );
GLSLC(1, } );
}
typedef struct HorizontalPushData {
- VkDeviceAddress integral_data;
- VkDeviceAddress state_data;
- int32_t xoffs[TYPE_ELEMS];
- int32_t yoffs[TYPE_ELEMS];
uint32_t width[4];
uint32_t height[4];
uint32_t ws_stride[4];
int32_t patch_size[4];
float strength[4];
+ VkDeviceAddress integral_base;
+ uint32_t integral_size;
uint32_t int_stride;
+ uint32_t xyoffs_start;
} HorizontalPushData;
static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
@@ -249,26 +239,18 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
FFVulkanDescriptorSetBinding *desc_set;
int max_dim = FFMAX(width, height);
uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
- int max_shm = vkctx->props.properties.limits.maxComputeSharedMemorySize;
int wg_size, wg_rows;
/* Round the max workgroup size to the previous power of two */
- max_wg = 1 << (31 - ff_clz(max_wg));
wg_size = max_wg;
wg_rows = 1;
if (max_wg > max_dim) {
- wg_size = max_wg / (max_wg / max_dim);
+ wg_size = max_dim;
} else if (max_wg < max_dim) {
- /* First, make it fit */
+ /* Make it fit */
while (wg_size*wg_rows < max_dim)
wg_rows++;
-
- /* Second, make sure there's enough shared memory */
- while ((wg_size * TYPE_SIZE + TYPE_SIZE + 2*4) > max_shm) {
- wg_size >>= 1;
- wg_rows++;
- }
}
RET(ff_vk_shader_init(pl, shd, "nlmeans_weights", VK_SHADER_STAGE_COMPUTE_BIT, 0));
@@ -278,33 +260,24 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
if (t > 1)
GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
- GLSLC(0, #pragma use_vulkan_memory_model );
- GLSLC(0, #extension GL_KHR_memory_scope_semantics : enable );
GLSLC(0, );
- GLSLF(0, #define N_ROWS %i ,*nb_rows);
- GLSLC(0, #define WG_SIZE (gl_WorkGroupSize.x) );
- GLSLF(0, #define LG_WG_SIZE %i ,ff_log2(shd->local_size[0]));
- GLSLC(0, #define PARTITION_SIZE (N_ROWS*WG_SIZE) );
- GLSLF(0, #define DTYPE %s ,TYPE_NAME);
- GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
+ GLSLF(0, #define DTYPE %s ,TYPE_NAME);
+ GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
GLSLC(0, );
- GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) coherent buffer DataBuffer { );
+ GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
GLSLC(1, DTYPE v[]; );
GLSLC(0, }; );
GLSLC(0, );
- GLSLC(0, layout(buffer_reference) buffer StateData; );
- GLSLC(0, );
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
- GLSLC(1, coherent DataBuffer integral_data; );
- GLSLC(1, StateData state; );
- GLSLF(1, uint xoffs[%i]; ,TYPE_ELEMS);
- GLSLF(1, uint yoffs[%i]; ,TYPE_ELEMS);
GLSLC(1, uvec4 width; );
GLSLC(1, uvec4 height; );
GLSLC(1, uvec4 ws_stride; );
GLSLC(1, ivec4 patch_size; );
GLSLC(1, vec4 strength; );
+ GLSLC(1, DataBuffer integral_base; );
+ GLSLC(1, uint integral_size; );
GLSLC(1, uint int_stride; );
+ GLSLC(1, uint xyoffs_start; );
GLSLC(0, }; );
GLSLC(0, );
@@ -370,42 +343,65 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
};
RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1 + 2*desc->nb_components, 0, 0));
- GLSLD( ff_source_prefix_sum_comp );
- GLSLC(0, );
- GLSLC(0, void main() );
- GLSLC(0, { );
- GLSLC(1, uint64_t offset; );
- GLSLC(1, DataBuffer dst; );
- GLSLC(1, float s1; );
- GLSLC(1, DTYPE s2; );
- GLSLC(1, int r; );
- GLSLC(1, int x; );
- GLSLC(1, int y; );
- GLSLC(1, int p; );
- GLSLC(0, );
- GLSLC(1, DTYPE a; );
- GLSLC(1, DTYPE b; );
- GLSLC(1, DTYPE c; );
- GLSLC(1, DTYPE d; );
- GLSLC(0, );
- GLSLC(1, DTYPE patch_diff; );
+ desc_set = (FFVulkanDescriptorSetBinding []) {
+ {
+ .name = "xyoffsets_buffer",
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .mem_quali = "readonly",
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .buf_content = "ivec2 xyoffsets[];",
+ },
+ };
+ RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1, 1, 0));
+
+ GLSLC(0, );
+ GLSLC(0, void main() );
+ GLSLC(0, { );
+ GLSLC(1, uint64_t offset; );
+ GLSLC(1, DataBuffer dst; );
+ GLSLC(1, float s1; );
+ GLSLC(1, DTYPE s2; );
+ GLSLC(1, DTYPE prefix_sum; );
+ GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
+ GLSLC(1, int r; );
+ GLSLC(1, ivec2 pos; );
+ GLSLC(1, int p; );
+ GLSLC(0, );
+ GLSLC(1, DataBuffer integral_data; );
+ GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS);
+ GLSLC(0, );
+ GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
+
+ GLSLC(1, offset = uint64_t(integral_size)*invoc_idx; );
+ GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset); );
+
+ GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
+ for (int i = 0; i < TYPE_ELEMS*2; i += 2)
+ GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + 2*%i*invoc_idx + %i]; ,i/2,TYPE_ELEMS,i);
+ GLSLC(0, );
+ GLSLC(1, DTYPE a; );
+ GLSLC(1, DTYPE b; );
+ GLSLC(1, DTYPE c; );
+ GLSLC(1, DTYPE d; );
+ GLSLC(0, );
+ GLSLC(1, DTYPE patch_diff; );
if (TYPE_ELEMS == 4) {
- GLSLC(1, vec4 src; );
- GLSLC(1, vec4 w; );
+ GLSLC(1, vec4 src; );
+ GLSLC(1, vec4 w; );
} else {
- GLSLC(1, vec4 src[4]; );
- GLSLC(1, vec4 w[4]; );
+ GLSLC(1, vec4 src[4]; );
+ GLSLC(1, vec4 w[4]; );
}
- GLSLC(1, float w_sum; );
- GLSLC(1, float sum; );
- GLSLC(0, );
- GLSLC(1, bool lt; );
- GLSLC(1, bool gt; );
- GLSLC(0, );
+ GLSLC(1, float w_sum; );
+ GLSLC(1, float sum; );
+ GLSLC(0, );
+ GLSLC(1, bool lt; );
+ GLSLC(1, bool gt; );
+ GLSLC(0, );
for (int i = 0; i < desc->nb_components; i++) {
int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
- if (width > height) {
+ if (width >= height) {
insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off);
@@ -416,7 +412,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
}
}
- GLSLC(0, } );
+ GLSLC(0, } );
RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));
@@ -584,6 +580,8 @@ static av_cold int init_filter(AVFilterContext *ctx)
FFVulkanContext *vkctx = &s->vkctx;
const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
FFVkSPIRVCompiler *spv;
+ int *offsets_buf;
+ int offsets_dispatched = 0, nb_dispatches = 0;
const AVPixFmtDescriptor *desc;
desc = av_pix_fmt_desc_get(vkctx->output_format);
@@ -634,6 +632,20 @@ static av_cold int init_filter(AVFilterContext *ctx)
}
}
+ RET(ff_vk_create_buf(&s->vkctx, &s->xyoffsets_buf, 2*s->nb_offsets*sizeof(int32_t), NULL, NULL,
+ VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
+ VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+ RET(ff_vk_map_buffer(&s->vkctx, &s->xyoffsets_buf, (uint8_t **)&offsets_buf, 0));
+
+ for (int i = 0; i < 2*s->nb_offsets; i += 2) {
+ offsets_buf[i + 0] = s->xoffsets[i >> 1];
+ offsets_buf[i + 1] = s->yoffsets[i >> 1];
+ }
+
+ RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1));
+
s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, "
@@ -641,11 +653,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
s->opts.t = 1;
}
- if (!vkctx->feats_12.vulkanMemoryModel) {
- av_log(ctx, AV_LOG_ERROR, "Device doesn't support the Vulkan memory model!");
- return AVERROR(EINVAL);;
- }
-
spv = ff_vk_spirv_init();
if (!spv) {
av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
@@ -663,8 +670,19 @@ static av_cold int init_filter(AVFilterContext *ctx)
RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,
spv, desc, planes));
- av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches, %i parallel\n",
- s->nb_offsets, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS) + 1, s->opts.t);
+ RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, NULL, 1, 0, 0,
+ s->xyoffsets_buf.address, s->xyoffsets_buf.size,
+ VK_FORMAT_UNDEFINED));
+
+ do {
+ int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
+ wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
+ offsets_dispatched += wg_invoc * TYPE_ELEMS;
+ nb_dispatches++;
+ } while (offsets_dispatched < s->nb_offsets);
+
+ av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches\n",
+ s->nb_offsets, nb_dispatches);
s->initialized = 1;
@@ -736,18 +754,16 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
int plane_widths[4];
int plane_heights[4];
+ int offsets_dispatched = 0;
+
/* Integral */
- AVBufferRef *state_buf;
- FFVkBuffer *state_vk;
- AVBufferRef *integral_buf;
+ AVBufferRef *integral_buf = NULL;
FFVkBuffer *integral_vk;
uint32_t int_stride;
size_t int_size;
- size_t state_size;
- int t_offset = 0;
/* Weights/sums */
- AVBufferRef *ws_buf;
+ AVBufferRef *ws_buf = NULL;
FFVkBuffer *ws_vk;
VkDeviceAddress weights_addr[4];
VkDeviceAddress sums_addr[4];
@@ -773,7 +789,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
/* Integral image */
int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows;
int_size = int_stride * int_stride * TYPE_SIZE;
- state_size = int_stride * 3 *TYPE_SIZE;
/* Plane dimensions */
for (int i = 0; i < desc->nb_components; i++) {
@@ -798,16 +813,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
return err;
integral_vk = (FFVkBuffer *)integral_buf->data;
- err = ff_vk_get_pooled_buffer(&s->vkctx, &s->state_buf_pool, &state_buf,
- VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
- VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
- NULL,
- s->opts.t * state_size,
- VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
- if (err < 0)
- return err;
- state_vk = (FFVkBuffer *)state_buf->data;
-
err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
@@ -844,9 +849,12 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
RET(ff_vk_exec_add_dep_frame(vkctx, exec, out,
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+
RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0));
- RET(ff_vk_exec_add_dep_buf(vkctx, exec, &state_buf, 1, 0));
+ integral_buf = NULL;
+
RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf, 1, 0));
+ ws_buf = NULL;
/* Input frame prep */
RET(ff_vk_create_imageviews(vkctx, exec, in_views, in));
@@ -869,6 +877,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
+ nb_buf_bar = 0;
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
.srcStageMask = ws_vk->stage,
@@ -881,6 +890,19 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
.size = ws_vk->size,
.offset = 0,
};
+ buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = integral_vk->stage,
+ .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+ .srcAccessMask = integral_vk->access,
+ .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = integral_vk->buf,
+ .size = integral_vk->size,
+ .offset = 0,
+ };
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
@@ -891,10 +913,13 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
});
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
+ integral_vk->stage = buf_bar[1].dstStageMask;
+ integral_vk->access = buf_bar[1].dstAccessMask;
- /* Weights/sums buffer zeroing */
+ /* Buffer zeroing */
vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
+ nb_buf_bar = 0;
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
.srcStageMask = ws_vk->stage,
@@ -948,29 +973,22 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
/* Weights pipeline */
ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_weights);
- for (int i = 0; i < s->nb_offsets; i += TYPE_ELEMS) {
- int *xoffs = s->xoffsets + i;
- int *yoffs = s->yoffsets + i;
+ do {
+ int wg_invoc;
HorizontalPushData pd = {
- integral_vk->address + t_offset*int_size,
- state_vk->address + t_offset*state_size,
- { 0 },
- { 0 },
{ plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
{ plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
{ ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
{ s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
{ s->strength[0], s->strength[1], s->strength[2], s->strength[2], },
+ integral_vk->address,
+ int_size,
int_stride,
+ offsets_dispatched * 2,
};
- memcpy(pd.xoffs, xoffs, sizeof(pd.xoffs));
- memcpy(pd.yoffs, yoffs, sizeof(pd.yoffs));
-
- /* Put a barrier once we run out of parallelism buffers */
- if (!t_offset) {
+ if (offsets_dispatched) {
nb_buf_bar = 0;
- /* Buffer prep/sync */
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
.srcStageMask = integral_vk->stage,
@@ -984,39 +1002,28 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
.size = integral_vk->size,
.offset = 0,
};
- buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
- .srcStageMask = state_vk->stage,
- .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
- .srcAccessMask = state_vk->access,
- .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
- VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = state_vk->buf,
- .size = state_vk->size,
- .offset = 0,
- };
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
.pBufferMemoryBarriers = buf_bar,
.bufferMemoryBarrierCount = nb_buf_bar,
});
- integral_vk->stage = buf_bar[0].dstStageMask;
- integral_vk->access = buf_bar[0].dstAccessMask;
- state_vk->stage = buf_bar[1].dstStageMask;
- state_vk->access = buf_bar[1].dstAccessMask;
+ integral_vk->stage = buf_bar[1].dstStageMask;
+ integral_vk->access = buf_bar[1].dstAccessMask;
}
- t_offset = (t_offset + 1) % s->opts.t;
/* Push data */
ff_vk_update_push_exec(vkctx, exec, &s->pl_weights, VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pd), &pd);
+ wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
+ wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
+
/* End of horizontal pass */
- vk->CmdDispatch(exec->buf, 1, 1, 1);
- }
+ vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
+
+ offsets_dispatched += wg_invoc * TYPE_ELEMS;
+ } while (offsets_dispatched < s->nb_offsets);
RET(denoise_pass(s, exec, ws_vk, ws_stride));
@@ -1033,6 +1040,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
return ff_filter_frame(outlink, out);
fail:
+ av_buffer_unref(&integral_buf);
+ av_buffer_unref(&ws_buf);
av_frame_free(&in);
av_frame_free(&out);
return err;
@@ -1051,7 +1060,6 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
ff_vk_shader_free(vkctx, &s->shd_denoise);
av_buffer_pool_uninit(&s->integral_buf_pool);
- av_buffer_pool_uninit(&s->state_buf_pool);
av_buffer_pool_uninit(&s->ws_buf_pool);
if (s->sampler)
diff --git a/libavfilter/vulkan/prefix_sum.comp b/libavfilter/vulkan/prefix_sum.comp
deleted file mode 100644
index 9147cd82fb..0000000000
--- a/libavfilter/vulkan/prefix_sum.comp
+++ /dev/null
@@ -1,151 +0,0 @@
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-
-#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
-#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
-
-// These correspond to X, A, P respectively in the prefix sum paper.
-#define FLAG_NOT_READY 0u
-#define FLAG_AGGREGATE_READY 1u
-#define FLAG_PREFIX_READY 2u
-
-layout(buffer_reference, buffer_reference_align = T_ALIGN) nonprivate buffer StateData {
- DTYPE aggregate;
- DTYPE prefix;
- uint flag;
-};
-
-shared DTYPE sh_scratch[WG_SIZE];
-shared DTYPE sh_prefix;
-shared uint sh_part_ix;
-shared uint sh_flag;
-
-void prefix_sum(DataBuffer dst, uint dst_stride, DataBuffer src, uint src_stride)
-{
- DTYPE local[N_ROWS];
- // Determine partition to process by atomic counter (described in Section 4.4 of prefix sum paper).
- if (gl_GlobalInvocationID.x == 0)
- sh_part_ix = gl_WorkGroupID.x;
-// sh_part_ix = atomicAdd(part_counter, 1);
-
- barrier();
- uint part_ix = sh_part_ix;
-
- uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
-
- // TODO: gate buffer read? (evaluate whether shader check or CPU-side padding is better)
- local[0] = src.v[ix*src_stride];
- for (uint i = 1; i < N_ROWS; i++)
- local[i] = local[i - 1] + src.v[(ix + i)*src_stride];
-
- DTYPE agg = local[N_ROWS - 1];
- sh_scratch[gl_LocalInvocationID.x] = agg;
- for (uint i = 0; i < LG_WG_SIZE; i++) {
- barrier();
- if (gl_LocalInvocationID.x >= (1u << i))
- agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)];
- barrier();
-
- sh_scratch[gl_LocalInvocationID.x] = agg;
- }
-
- // Publish aggregate for this partition
- if (gl_LocalInvocationID.x == WG_SIZE - 1) {
- state[part_ix].aggregate = agg;
- if (part_ix == 0)
- state[0].prefix = agg;
- }
-
- // Write flag with release semantics
- if (gl_LocalInvocationID.x == WG_SIZE - 1) {
- uint flag = part_ix == 0 ? FLAG_PREFIX_READY : FLAG_AGGREGATE_READY;
- atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
- }
-
- DTYPE exclusive = DTYPE(0);
- if (part_ix != 0) {
- // step 4 of paper: decoupled lookback
- uint look_back_ix = part_ix - 1;
-
- DTYPE their_agg;
- uint their_ix = 0;
- while (true) {
- // Read flag with acquire semantics.
- if (gl_LocalInvocationID.x == WG_SIZE - 1)
- sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
-
- // The flag load is done only in the last thread. However, because the
- // translation of memoryBarrierBuffer to Metal requires uniform control
- // flow, we broadcast it to all threads.
- barrier();
-
- uint flag = sh_flag;
- barrier();
-
- if (flag == FLAG_PREFIX_READY) {
- if (gl_LocalInvocationID.x == WG_SIZE - 1) {
- DTYPE their_prefix = state[look_back_ix].prefix;
- exclusive = their_prefix + exclusive;
- }
- break;
- } else if (flag == FLAG_AGGREGATE_READY) {
- if (gl_LocalInvocationID.x == WG_SIZE - 1) {
- their_agg = state[look_back_ix].aggregate;
- exclusive = their_agg + exclusive;
- }
- look_back_ix--;
- their_ix = 0;
- continue;
- } // else spins
-
- if (gl_LocalInvocationID.x == WG_SIZE - 1) {
- // Unfortunately there's no guarantee of forward progress of other
- // workgroups, so compute a bit of the aggregate before trying again.
- // In the worst case, spinning stops when the aggregate is complete.
- DTYPE m = src.v[(look_back_ix * PARTITION_SIZE + their_ix)*src_stride];
- if (their_ix == 0)
- their_agg = m;
- else
- their_agg += m;
-
- their_ix++;
- if (their_ix == PARTITION_SIZE) {
- exclusive = their_agg + exclusive;
- if (look_back_ix == 0) {
- sh_flag = FLAG_PREFIX_READY;
- } else {
- look_back_ix--;
- their_ix = 0;
- }
- }
- }
- barrier();
- flag = sh_flag;
- barrier();
- if (flag == FLAG_PREFIX_READY)
- break;
- }
-
- // step 5 of paper: compute inclusive prefix
- if (gl_LocalInvocationID.x == WG_SIZE - 1) {
- DTYPE inclusive_prefix = exclusive + agg;
- sh_prefix = exclusive;
- state[part_ix].prefix = inclusive_prefix;
- }
-
- if (gl_LocalInvocationID.x == WG_SIZE - 1)
- atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
- }
-
- barrier();
- if (part_ix != 0)
- exclusive = sh_prefix;
-
- DTYPE row = exclusive;
- if (gl_LocalInvocationID.x > 0)
- row += sh_scratch[gl_LocalInvocationID.x - 1];
-
- // note - may overwrite
- for (uint i = 0; i < N_ROWS; i++)
- dst.v[(ix + i)*dst_stride] = row + local[i];
-}
More information about the ffmpeg-cvslog
mailing list