[FFmpeg-devel] [PATCH]opencl: automatically select the fastest opencl device

Stefano Sabatini stefasab at gmail.com
Fri Nov 29 14:39:39 CET 2013


On date Tuesday 2013-11-26 06:54:23 -0600, Lenny Wang encoded:
> On Sat, Nov 23, 2013 at 1:13 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
[...]
> Attached patch modified based on your comments, thanks for review in advance.
> 
> 1. added a new generic ffmpeg-util as '-show_opencl', it performs
> tests and benchmark all available opencl devices with output showing
> device/platform indices sorted based on scores, the output can also be
> directly used for setting '-opencl_options'
> 2. added a new av_opencl_test_performance() API for ocl-util, which
> performs tests with user defined benchmark on any opencl device types
> 3. removed CL_DEVICE_TYPE_DEFAULT cause it generates duplicated device
> entries from av_opencl_get_device_list()
> 4. opencl benchmark kernel source separated as /libavutil/opencl_bench_kernel.h
> 5. cosmetics

> From: Lenny Wang <lwanghpc at gmail.com>
> Date: Tue, 26 Nov 2013 06:31:00 -0600
> Subject: [PATCH] cmdutils&opencl: perform tests on available OpenCL devices
> 
> ---
>  cmdutils.c                      | 123 ++++++++++++++++++++++++++++++++++++++++
>  cmdutils.h                      |   6 ++
>  cmdutils_common_opts.h          |   1 +
>  doc/fftools-common-opts.texi    |   3 +
>  doc/utils.texi                  |   4 +-
>  libavutil/opencl.c              |  85 ++++++++++++++++++++++++++-
>  libavutil/opencl.h              |  19 +++++++
>  libavutil/opencl_bench_kernel.h |  86 ++++++++++++++++++++++++++++
>  8 files changed, 324 insertions(+), 3 deletions(-)
> 
> diff --git a/cmdutils.c b/cmdutils.c
> index 2608bce..7282e6c 100644
> --- a/cmdutils.c
> +++ b/cmdutils.c
> @@ -60,6 +60,8 @@
>  #endif
>  #if CONFIG_OPENCL
>  #include "libavutil/opencl.h"
> +#include "libavutil/time.h"
> +#include "libavutil/opencl_bench_kernel.h"
>  #endif
>  
>  
> @@ -986,6 +988,127 @@ int opt_timelimit(void *optctx, const char *opt, const char *arg)
>  }
>  
>  #if CONFIG_OPENCL
> +#define OCLCHECK(method, ... )                                                 \
> +do {                                                                           \
> +    ret = method(__VA_ARGS__);                                                 \
> +    if (ret != CL_SUCCESS) {                                                   \
> +        av_log(NULL, AV_LOG_ERROR, # method " error '%d'\n", ret);             \

opencl_errstr()?

> +        goto end;                                                              \

Also you should set ret to AVERROR_EXTERNAL, since the function is
supposed to return an AVERROR error code (as opposed to an OpenCL
error code).

> +    }                                                                          \
> +} while (0)
> +
> +#define CREATEBUF(out, flags, size)                                            \
> +do {                                                                           \
> +    out = clCreateBuffer(ext_opencl_env->context, flags, size, NULL, &ret);    \
> +    if (ret != CL_SUCCESS) {                                                   \
> +        av_log(NULL, AV_LOG_ERROR, "clCreateBuffer error '%d'\n", ret);        \
> +        return -1;                                                             \

AVERROR_EXTERNAL

> +    }                                                                          \
> +} while (0)
> +
> +static void fill_rand_int(int *data, int n)
> +{
> +    int i;
> +    srand(av_gettime());
> +    for (i = 0; i < n; i++)
> +        data[i] = rand();
> +}
> +
> +#define OPENCL_NB_ITER 5
> +static int run_opencl_bench(AVOpenCLExternalEnv *ext_opencl_env)
> +{
> +    int i, ret = 0, arg = 0;
> +    int width = 1920, height = 1088;
> +    int64_t start = 0;
> +    size_t kernel_len;
> +    char *inbuf;
> +    int *mask;
> +    int buf_size = width * height * sizeof(char);
> +    int mask_size = sizeof(uint32_t) * 128;
> +
> +    cl_mem cl_mask, cl_inbuf, cl_outbuf;
> +    cl_kernel kernel = NULL;
> +    cl_program program = NULL;
> +    size_t local_work_size_2d[2] = {16, 16};
> +    size_t global_work_size_2d[2] = {(size_t)width, (size_t)height};
> +
> +    if (!(inbuf = av_malloc(buf_size))) {
> +        av_log(NULL, AV_LOG_ERROR, "Could not allocate buffer\n");
> +        return AVERROR(ENOMEM);
> +    }
> +    if (!(mask = av_malloc(mask_size))) {
> +        av_log(NULL, AV_LOG_ERROR, "Could not allocate buffer\n");
> +        return AVERROR(ENOMEM);
> +    }
> +    fill_rand_int((int*)inbuf, buf_size/4);
> +    fill_rand_int(mask, mask_size/4);
> +
> +    CREATEBUF(cl_mask, CL_MEM_READ_ONLY, mask_size);
> +    CREATEBUF(cl_inbuf, CL_MEM_READ_ONLY, buf_size);
> +    CREATEBUF(cl_outbuf, CL_MEM_READ_WRITE, buf_size);
> +
> +    kernel_len = strlen(ocl_bench_source);
> +    program = clCreateProgramWithSource(ext_opencl_env->context, 1, &ocl_bench_source,
> +                                        &kernel_len, &ret);
> +    if (ret != CL_SUCCESS || !program) {
> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create benchmark program\n");
> +        goto end;
> +    }

> +    ret = clBuildProgram(program, 1, &(ext_opencl_env->device_id), NULL, NULL, NULL);
> +    if (ret != CL_SUCCESS) {
> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to build benchmark program\n");
> +        goto end;
> +    }
> +    kernel = clCreateKernel(program, "unsharp_bench", &ret);
> +    if (ret != CL_SUCCESS) {
> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create benchmark kernel\n");
> +        goto end;
> +    }

ret = AVERROR_EXTERNAL

> +
> +    OCLCHECK(clEnqueueWriteBuffer, ext_opencl_env->command_queue, cl_inbuf, CL_TRUE, 0,
> +                                    buf_size, inbuf, 0, NULL, NULL);
> +    OCLCHECK(clEnqueueWriteBuffer, ext_opencl_env->command_queue, cl_mask, CL_TRUE, 0,
> +                                    mask_size, mask, 0, NULL, NULL);

nit: weird align

> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_inbuf);
> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_outbuf);
> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_mask);
> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_int), &width);
> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_int), &height);
> +    /* warm up */
> +    OCLCHECK(clEnqueueNDRangeKernel, ext_opencl_env->command_queue, kernel, 2, NULL,
> +                global_work_size_2d, local_work_size_2d, 0, NULL, NULL);
> +    clFinish(ext_opencl_env->command_queue);
> +
> +    start = av_gettime();
> +    for (i = 0; i < OPENCL_NB_ITER; i++)
> +	    OCLCHECK(clEnqueueNDRangeKernel, ext_opencl_env->command_queue, kernel, 2, NULL,
> +                    global_work_size_2d, local_work_size_2d, 0, NULL, NULL);
> +	clFinish(ext_opencl_env->command_queue);
> +    return (int)((av_gettime() - start)/OPENCL_NB_ITER);

> +end:
> +    if (kernel)
> +        clReleaseKernel(kernel);
> +    if (program)
> +        clReleaseProgram(program);
> +    if (cl_inbuf)
> +        clReleaseMemObject(cl_inbuf);
> +    if (cl_outbuf)
> +        clReleaseMemObject(cl_outbuf);
> +    if (cl_mask)
> +        clReleaseMemObject(cl_mask);
> +    av_free(inbuf);
> +    av_free(mask);
> +    return -1;

aren't you leaking these?

> +}
> +
> +void show_opencl(void *optctx, const char *opt, const char *arg)
> +{
> +    AVOpenCLDeviceList *device_list;
> +    av_opencl_get_device_list(&device_list);
> +    av_opencl_test_performance(device_list, run_opencl_bench, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_CPU);
> +    av_opencl_free_device_list(&device_list);
> +}
> +
>  int opt_opencl(void *optctx, const char *opt, const char *arg)
>  {
>      char *key, *value;
> diff --git a/cmdutils.h b/cmdutils.h
> index b814961..071c7aa 100644
> --- a/cmdutils.h
> +++ b/cmdutils.h
> @@ -497,6 +497,12 @@ int show_sample_fmts(void *optctx, const char *opt, const char *arg);
>  int show_colors(void *optctx, const char *opt, const char *arg);
>  

>  /**
> + * Print a listing containing all the opencl platforms and devices with
> + * benchmark scores.

nit: OpenCL

> + */
> +void show_opencl(void *optctx, const char *opt, const char *arg);
> +
> +/**
>   * Return a positive value if a line read from standard input
>   * starts with [yY], otherwise return 0.
>   */
> diff --git a/cmdutils_common_opts.h b/cmdutils_common_opts.h
> index 3e3f0ac..7938b06 100644
> --- a/cmdutils_common_opts.h
> +++ b/cmdutils_common_opts.h
> @@ -22,5 +22,6 @@
>      { "max_alloc"  , HAS_ARG,  {.func_arg = opt_max_alloc},     "set maximum size of a single allocated block", "bytes" },
>      { "cpuflags"   , HAS_ARG | OPT_EXPERT, { .func_arg = opt_cpuflags }, "force specific cpu flags", "flags" },
>  #if CONFIG_OPENCL
> +    { "show_opencl", OPT_EXIT, {.func_arg = show_opencl},       "run OpenCL benchmark across available devices" },

It is sometimes useful to show available devices without testing their
performance.

What about having a separate options like:
show_opencl
bench_opencl

>      { "opencl_options", HAS_ARG, {.func_arg = opt_opencl},      "set OpenCL environment options" },

Also probably "opencl_" as prefix would be more consistent with
opencl_options.

>  #endif
> diff --git a/doc/fftools-common-opts.texi b/doc/fftools-common-opts.texi
> index 617af2f..fbcafba 100644
> --- a/doc/fftools-common-opts.texi
> +++ b/doc/fftools-common-opts.texi
> @@ -250,6 +250,9 @@ Possible flags for this option are:
>  @end table
>  @end table
>  
> + at item -show_opencl
> +Test and benchmark available OpenCL devices.
> +
>  @item -opencl_options options (@emph{global})
>  Set OpenCL environment options. This option is only available when
>  FFmpeg has been compiled with @code{--enable-opencl}.
> diff --git a/doc/utils.texi b/doc/utils.texi
> index 9377139..4583a20 100644
> --- a/doc/utils.texi
> +++ b/doc/utils.texi
> @@ -1051,13 +1051,13 @@ See reference "OpenCL Specification Version: 1.2 chapter 5.6.4".
>  Select the index of the platform to run OpenCL code.
>  
>  The specified index must be one of the indexes in the device list
> -which can be obtained with @code{av_opencl_get_device_list()}.
> +which can be obtained with @code{ffmpeg -show_opencl} or @code{av_opencl_get_device_list()}.
>  
>  @item device_idx
>  Select the index of the device used to run OpenCL code.
>  
>  The specifed index must be one of the indexes in the device list which
> -can be obtained with @code{av_opencl_get_device_list()}.
> +can be obtained with @code{ffmpeg -show_opencl} or @code{av_opencl_get_device_list()}.
>  
>  @end table
>  
> diff --git a/libavutil/opencl.c b/libavutil/opencl.c
> index ae4c476..a63349e 100644
> --- a/libavutil/opencl.c
> +++ b/libavutil/opencl.c
> @@ -98,7 +98,7 @@ static const AVClass openclutils_class = {
>  
>  static OpenclContext opencl_ctx = {&openclutils_class};
>  
> -static const cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_DEFAULT};
> +static const cl_device_type device_type[] = {CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU};

Again, this seems unrelated, and should probably go to a separate
patch.

>  typedef struct {
>      int err_code;
> @@ -761,3 +761,86 @@ int av_opencl_buffer_read_image(uint8_t **dst_data, int *plane_size, int plane_n
>      }
>      return 0;
>  }
> +
> +static int compare_ocl_device_desc(const void *a, const void *b)
> +{
> +    return ((oclDeviceDescriptor*)a)->runtime - ((oclDeviceDescriptor*)b)->runtime;
> +}
> +
> +int av_opencl_test_performance(AVOpenCLDeviceList *device_list,
> +        int (*run_benchmark)(AVOpenCLExternalEnv *ext_opencl_env), int device_type)

nit: weird indent

Also possibly shorter: av_opencl_benchmark or _run_benchmark

> +{
> +    int i, j, nb_devices = 0, count = 0;
> +    int time = 0;
> +    cl_int status;
> +    cl_context_properties cps[3];
> +    AVOpenCLDeviceNode *device_node = NULL;
> +    AVOpenCLExternalEnv *ext_opencl_env = NULL;
> +    oclDeviceDescriptor *devices = NULL;
> +
> +    for (i = 0; i < device_list->platform_num; i++)
> +        nb_devices += device_list->platform_node[i]->device_num;

> +    if (!nb_devices) {
> +        av_log(NULL, AV_LOG_ERROR, "No OpenCL device detected!\n");
> +        return -1;

What about using the openclutils_class for logging this? Same for the
other logs.

> +    }
> +    if (!(devices = av_malloc(sizeof(oclDeviceDescriptor) * nb_devices))) {
> +        av_log(NULL, AV_LOG_ERROR, "Could not allocate buffer\n");
> +        return AVERROR(ENOMEM);
> +    }
> +    ext_opencl_env = av_opencl_alloc_external_env();
> +
> +    for (i = 0; i < device_list->platform_num; i++) {
> +        for (j = 0; j < device_list->platform_node[i]->device_num; j++) {
> +            device_node = device_list->platform_node[i]->device_node[j];
> +            ext_opencl_env->device_type = device_node->device_type;
> +            if (!(device_node->device_type & device_type))
> +                continue;
> +            ext_opencl_env->device_id = device_node->device_id;
> +            ext_opencl_env->platform_id = device_list->platform_node[i]->platform_id;

> +            av_log(NULL, AV_LOG_INFO, "Performing test on OpenCL platform %d: %s, device %d: %s\n",
> +                   i, device_list->platform_node[i]->platform_name, j, device_node->device_name);

VERBOSE

> +
> +            cps[0] = CL_CONTEXT_PLATFORM;
> +            cps[1] = (cl_context_properties)ext_opencl_env->platform_id;
> +            cps[2] = 0;
> +            ext_opencl_env->context = clCreateContextFromType(cps, ext_opencl_env->device_type,
> +                                                                NULL, NULL, &status);
> +            if (status != CL_SUCCESS || !ext_opencl_env->context) {
> +                if (ext_opencl_env->context)
> +                    clReleaseContext(ext_opencl_env->context);
> +                continue;
> +            }
> +            ext_opencl_env->command_queue = clCreateCommandQueue(ext_opencl_env->context,
> +                                                        ext_opencl_env->device_id, 0, &status);
> +            if (status != CL_SUCCESS || !ext_opencl_env->command_queue) {
> +                if (ext_opencl_env->command_queue)
> +                    clReleaseCommandQueue(ext_opencl_env->command_queue);
> +                if (ext_opencl_env->context)
> +                    clReleaseContext(ext_opencl_env->context);
> +                continue;
> +            }
> +            time = run_benchmark(ext_opencl_env);
> +            if (time > 0) {
> +                devices[count].platform_idx = i;
> +                devices[count].device_idx = j;
> +                devices[count].runtime = time;
> +                strcpy(devices[count].device_name, device_node->device_name);
> +                count++;
> +            } else {
> +                av_log(NULL, AV_LOG_ERROR, "Benchmark failed with this device!\n");
> +            }
> +            clReleaseCommandQueue(ext_opencl_env->command_queue);
> +            clReleaseContext(ext_opencl_env->context);
> +        }
> +    }
> +

> +    qsort(devices, count, sizeof(oclDeviceDescriptor), compare_ocl_device_desc);
> +    for (i = 0; i < count; i++)
> +        av_log(NULL, AV_LOG_WARNING, "platform_idx=%d\tdevice_idx=%d\tdevice_name=%s\truntime=%d\n",
> +                devices[i].platform_idx, devices[i].device_idx, devices[i].device_name, devices[i].runtime);

This is of limited use for a programmer, since she's not supposed to
parse log output to get meaningful information. 

You should either export the structure, or return every single field
in the function through reference passing, but this would be a bit
ugly.

> +
> +    av_opencl_free_external_env(&ext_opencl_env);
> +    av_free(devices);
> +    return 0;
> +}
> diff --git a/libavutil/opencl.h b/libavutil/opencl.h
> index e4ecbf8..166645a 100644
> --- a/libavutil/opencl.h
> +++ b/libavutil/opencl.h
> @@ -57,6 +57,13 @@ typedef struct {
>  } AVOpenCLDeviceNode;
>  

>  typedef struct {
> +    int device_idx;
> +    int platform_idx;
> +    int runtime;
> +    char device_name[AV_OPENCL_MAX_DEVICE_NAME_SIZE];
> +} oclDeviceDescriptor;

Is this currently exposed? If not it should not be published in
opencl.h.

Also it needs consistent prefix, like AVOpenCLDeviceBenchmark.

> +
> +typedef struct {
>      cl_platform_id platform_id;
>      char platform_name[AV_OPENCL_MAX_PLATFORM_NAME_SIZE];
>      int device_num;
> @@ -310,4 +317,16 @@ void av_opencl_release_kernel(AVOpenCLKernelEnv *env);
>   */
>  void av_opencl_uninit(void);
>  
> +/**

> + * Benchmark OpenCL devices with input function

missing final point

> + *
> + * @param device_list       pointer to OpenCL device list

> + * @param run_benchmark     pointer to function implements the benchmark

which implements the benchmark

Also from this doxy it is not really clear what the function should
do, one or two sentences should make it more clear.

> + * @param device_type       specify CL_DEVICE_TYPE to be tested, one of or
> + *                          a combination of CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU
> + * @return >=0 on success, a negative error code in case of failure
> + */
> +int av_opencl_test_performance(AVOpenCLDeviceList *device_list,
> +        int (*run_benchmark)(AVOpenCLExternalEnv *ext_opencl_env),

> int device_type);

int -> cl_device_type

Another problem is that this function is not very flexible, for
example if the user only wants to run the benchmark of a single
device.

What about:
int av_opencl_benchmark(AVOpenCLDeviceNode *device,
                        int (*benchmark)(AVOpenCLExternalEnv *ext_opencl_env));

return the number of time UNITs for running the device, a negative
value in case of failure.
?

So you can sort and print the results at the application level.

>  #endif /* LIBAVUTIL_OPENCL_H */
> diff --git a/libavutil/opencl_bench_kernel.h b/libavutil/opencl_bench_kernel.h
> new file mode 100644
> index 0000000..9ee4b3a
> --- /dev/null
> +++ b/libavutil/opencl_bench_kernel.h
> @@ -0,0 +1,86 @@
> +/*
> + * Copyright (C) 2013 Lenny Wang
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef OPENCL_BENCH_KERNEL_H
> +#define OPENCL_BENCH_KERNEL_H

Since this is not used by the library, it should probably stay outside
the library, for example in cmdutils_opencl_bench_kernel.h.
-- 
FFmpeg = Fancy and Fast Mystic Pitiless Elfic Guide


More information about the ffmpeg-devel mailing list