[FFmpeg-devel] [PATCH v2 2/2] dnn_backend_native_layer_conv2d.c:Add multithread function
Guo, Yejun
yejun.guo at intel.com
Fri Sep 4 16:34:36 EEST 2020
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces at ffmpeg.org> On Behalf Of
> xujunzz at sjtu.edu.cn
> Sent: 2020年9月4日 20:46
> To: ffmpeg-devel at ffmpeg.org
> Cc: xujunzz at sjtu.edu.cn
> Subject: [FFmpeg-devel] [PATCH v2 2/2]
> dnn_backend_native_layer_conv2d.c:Add multithread function
>
> From: Xu Jun <xujunzz at sjtu.edu.cn>
>
> v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test
> dnn-layer-conv2d-test.c
>
> Use pthread to multithread dnn_execute_layer_conv2d.
> Can be tested with command "./ffmpeg_g -i input.png -vf \
> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
> espcn.model:input=x:output=y:options=conv2d_threads=23 \
> -y sr_native.jpg -benchmark"
>
> before patch: utime=11.238s stime=0.005s rtime=11.248s
> after patch: utime=20.817s stime=0.047s rtime=1.051s
> on my 3900X 12c24t @4.2GHz
>
> The increase in utime is expected: Hyper-Threading exposes twice as
> many logical cores as physical cores, but the CPU's compute
> throughput improves by less than a factor of two. Since utime sums
> the runtime of all logical cores, using a thread count close to the
> number of logical cores doubles utime, while rtime is reduced by
> less than half on Hyper-Threading CPUs.
>
> Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
> ---
> .../dnn/dnn_backend_native_layer_conv2d.c | 107 ++++++++++++++++--
> tests/dnn/dnn-layer-conv2d-test.c | 12 +-
> 2 files changed, 106 insertions(+), 13 deletions(-)
>
> diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> index d079795bf8..4068a13ab4 100644
> --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> @@ -19,10 +19,27 @@
> */
>
> #include "libavutil/avassert.h"
> +#include "libavutil/thread.h"
> +#include "libavutil/cpu.h"
> #include "dnn_backend_native_layer_conv2d.h"
>
> #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
>
> +//struct to pass parameters
> +typedef struct thread_common_param{
> + DnnOperand *operands;
> + const int32_t *input_operand_indexes;
> + int32_t output_operand_index;
> + const void *parameters;
> + NativeContext *ctx;
> + int thread_num;
> +} thread_common_param;
> +
> +typedef struct thread_param{
> + thread_common_param *thread_common_param;
> + int thread_index;
> +} thread_param;
> +
> int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int
> file_size, int operands_num)
> {
> ConvolutionalParams *conv_params;
> @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext
> *model_file_context, int fil
> return dnn_size;
> }
>
> -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t
> *input_operand_indexes,
> - int32_t output_operand_index, const void
> *parameters, NativeContext *ctx)
> +static void * dnn_execute_layer_conv2d_thread(void *threadarg)
> {
> + //pass parameters
> + thread_param *thread_param = (struct thread_param *)threadarg;
> + thread_common_param *thread_common_param =
> thread_param->thread_common_param;
> + DnnOperand *operands = thread_common_param->operands;
> float *output;
> - int32_t input_operand_index = input_operand_indexes[0];
> + int32_t input_operand_index =
> thread_common_param->input_operand_indexes[0];
> int number = operands[input_operand_index].dims[0];
> int height = operands[input_operand_index].dims[1];
> int width = operands[input_operand_index].dims[2];
> int channel = operands[input_operand_index].dims[3];
> const float *input = operands[input_operand_index].data;
> - const ConvolutionalParams *conv_params = (const ConvolutionalParams
> *)parameters;
> + const ConvolutionalParams *conv_params = (const ConvolutionalParams
> *)(thread_common_param->parameters);
>
> int radius = conv_params->kernel_size >> 1;
> int src_linesize = width * conv_params->input_num;
> @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand
> *operands, const int32_t *input_operand_
> int filter_size = conv_params->kernel_size * filter_linesize;
> int pad_size = (conv_params->padding_method == VALID) ?
> (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>
> - DnnOperand *output_operand = &operands[output_operand_index];
> + int thread_stride = (height - pad_size * 2) /
> thread_common_param->thread_num;
> + int thread_start = thread_stride * thread_param->thread_index +
> pad_size;
> + int thread_end = (thread_param->thread_index ==
> thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start
> + thread_stride);
> +
> + DnnOperand *output_operand =
> &operands[thread_common_param->output_operand_index];
> output_operand->dims[0] = number;
> output_operand->dims[1] = height - pad_size * 2;
> output_operand->dims[2] = width - pad_size * 2;
> @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand
> *operands, const int32_t *input_operand_
> output_operand->data_type =
> operands[input_operand_index].data_type;
> output_operand->length =
> calculate_operand_data_length(output_operand);
> if (output_operand->length <= 0) {
> - av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
> - return DNN_ERROR;
> + av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output
> data length overflow\n");
> + return (void *)DNN_ERROR;
> }
> output_operand->data = av_realloc(output_operand->data,
> output_operand->length);
> if (!output_operand->data) {
> - av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for
> output\n");
> - return DNN_ERROR;
> + av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to
> reallocate memory for output\n");
> + return (void *)DNN_ERROR;
> }
> +
> output = output_operand->data;
> + output += (conv_params->output_num) * (width - 2 * pad_size) *
> (thread_start - pad_size);
>
> av_assert0(channel == conv_params->input_num);
>
> - for (int y = pad_size; y < height - pad_size; ++y) {
> + for (int y = thread_start; y < thread_end; ++y) {
> for (int x = pad_size; x < width - pad_size; ++x) {
> for (int n_filter = 0; n_filter < conv_params->output_num;
> ++n_filter) {
> if (conv_params->has_bias)
> @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand
> *operands, const int32_t *input_operand_
> output += conv_params->output_num;
> }
> }
> - return 0;
> + return (void *)0;
> +}
> +
> +
> +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t
> *input_operand_indexes,
> + int32_t output_operand_index, const void
> *parameters, NativeContext *ctx)
> +{
> + int thread_num = (ctx->options.conv2d_threads <= 0 ||
> ctx->options.conv2d_threads > av_cpu_count())
> + ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
> +#if HAVE_PTHREAD_CANCEL
> + pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
> +#endif
> + thread_param **thread_param = av_malloc(thread_num *
> sizeof(*thread_param));
> + void *res;
> + int error_flag = 0;
> +
> + //struct used to pass parameters
> + thread_common_param thread_common_param;
> + thread_common_param.operands = operands;
> + thread_common_param.input_operand_indexes =
> input_operand_indexes;
> + thread_common_param.output_operand_index =
> output_operand_index;
> + thread_common_param.parameters = parameters;
> + thread_common_param.ctx = ctx;
> +#if HAVE_PTHREAD_CANCEL
> + thread_common_param.thread_num = thread_num;
> +
> + //create threads
> + for (int i = 0; i < thread_num; i++){
> + thread_param[i] = av_malloc(sizeof(thread_param));
> + thread_param[i]->thread_common_param =
> &thread_common_param;
> + thread_param[i]->thread_index = i;
> + pthread_create(&thread_id[i], NULL,
> dnn_execute_layer_conv2d_thread, (void *)thread_param[i]);
> + }
> +
> + //join threads, res gets function return
> + for (int i = 0; i < thread_num; i++){
> + pthread_join(thread_id[i], &res);
> + if ((int)res != 0)
> + error_flag = (int)res;
> + }
> +
> + //release memory
> + av_free(thread_id);
> +
> + for (int i = 0; i < thread_num; i++){
> + av_free(thread_param[i]);
> + }
> +#else
> + thread_common_param.thread_num = 1;
> + thread_param[0] = av_malloc(sizeof(thread_param));
> + thread_param[0]->thread_common_param = &thread_common_param;
> + thread_param[0]->thread_index = 0;
> + res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
> + if ((int)res != 0)
> + error_flag = (int)res;
> + av_free(thread_param[0]);
> +#endif
> +
> + av_free(thread_param);
> + return error_flag;
> }
> diff --git a/tests/dnn/dnn-layer-conv2d-test.c
> b/tests/dnn/dnn-layer-conv2d-test.c
> index 836839cc64..f695fcee0d 100644
> --- a/tests/dnn/dnn-layer-conv2d-test.c
> +++ b/tests/dnn/dnn-layer-conv2d-test.c
> @@ -96,6 +96,10 @@ static int test_with_same_dilate(void)
> };
> float bias[2] = { -1.6574852, -0.72915393 };
>
> + NativeContext ctx;
> + ctx.class = &dnn_native_class;
> + ctx.options.conv2d_threads = 1;
> +
> params.activation = TANH;
> params.has_bias = 1;
> params.biases = bias;
> @@ -114,7 +118,7 @@ static int test_with_same_dilate(void)
> operands[1].data = NULL;
>
> input_indexes[0] = 0;
> - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
> + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>
> output = operands[1].data;
> for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
> @@ -196,6 +200,10 @@ static int test_with_valid(void)
> };
> float bias[2] = { -0.4773722, -0.19620377 };
>
> + NativeContext ctx;
> + ctx.class = &dnn_native_class;
> + ctx.options.conv2d_threads = 1;
> +
> params.activation = TANH;
> params.has_bias = 1;
> params.biases = bias;
> @@ -214,7 +222,7 @@ static int test_with_valid(void)
> operands[1].data = NULL;
>
> input_indexes[0] = 0;
> - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
> + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>
> output = operands[1].data;
> for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
LGTM, thanks.
More information about the ffmpeg-devel
mailing list