[FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support draw text with detection bounding boxes in side_data
Guo, Yejun
yejun.guo at intel.com
Wed May 26 04:14:45 EEST 2021
> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces at ffmpeg.org> On Behalf Of Guo,
> Yejun
> Sent: 2021年5月25日 9:08
> To: FFmpeg development discussions and patches
> <ffmpeg-devel at ffmpeg.org>
> Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter
> support draw text with detection bounding boxes in side_data
>
>
>
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces at ffmpeg.org> On Behalf Of
> Guo,
> > Yejun
> > Sent: 2021年5月20日 11:04
> > To: FFmpeg development discussions and patches
> > <ffmpeg-devel at ffmpeg.org>
> > Subject: Re: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter
> > support draw text with detection bounding boxes in side_data
> >
> >
> >
> > > -----Original Message-----
> > > From: ffmpeg-devel <ffmpeg-devel-bounces at ffmpeg.org> On Behalf Of
> Ting
> > > Fu
> > > Sent: 2021年5月14日 16:47
> > > To: ffmpeg-devel at ffmpeg.org
> > > Subject: [FFmpeg-devel] [PATCH 3/3] libavfilter: vf_drawtext filter support
> > > draw text with detection bounding boxes in side_data
> > >
> > > > This feature can be used with dnn detection by setting vf_drawtext's option
> > > > text_source=side_data_detection_bboxes, for example:
> > > > ./ffmpeg -i face.jpeg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
> > > > input=data:output=detection_out:labels=face-detection-adas-0001.label,drawbox=box_source=
> > > > side_data_detection_bboxes,drawtext=text_source=side_data_detection_bboxes:fontcolor=green:\
> > > > fontsize=40, -y face_detect.jpeg
> > > Please note, the default fontsize of vf_drawtext is 12, which may be too
> > > small to be seen clearly.
> > >
> > > Signed-off-by: Ting Fu <ting.fu at intel.com>
> > > ---
> > > doc/filters.texi | 8 ++++
> > > libavfilter/vf_drawtext.c | 77
> > > ++++++++++++++++++++++++++++++++++++---
> > > 2 files changed, 79 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/doc/filters.texi b/doc/filters.texi
> > > index f2ac8c4cc8..d10e6de03d 100644
> > > --- a/doc/filters.texi
> > > +++ b/doc/filters.texi
> > > @@ -10788,6 +10788,14 @@ parameter @var{text}.
> > >
> > > If both @var{text} and @var{textfile} are specified, an error is thrown.
> > >
> > > > +@item text_source
> > > > +Text source should be set to side_data_detection_bboxes if you want to use the
> > > > +text data in the detection bboxes of the side data.
> > > +
> > > > +If text source is set, @var{text} and @var{textfile} are ignored and the text
> > > > +data in the detection bboxes of the side data is used instead. So please do not
> > > > +use this parameter if you are not sure about the text source.
> > > +
> > > @item reload
> > > If set to 1, the @var{textfile} will be reloaded before each frame.
> > > Be sure to update it atomically, or it may be read partially, or even fail.
> > > diff --git a/libavfilter/vf_drawtext.c b/libavfilter/vf_drawtext.c
> > > index 7ea057b812..382d589e26 100644
> > > --- a/libavfilter/vf_drawtext.c
> > > +++ b/libavfilter/vf_drawtext.c
> > > @@ -55,6 +55,7 @@
> > > #include "libavutil/time_internal.h"
> > > #include "libavutil/tree.h"
> > > #include "libavutil/lfg.h"
> > > +#include "libavutil/detection_bbox.h"
> > > #include "avfilter.h"
> > > #include "drawutils.h"
> > > #include "formats.h"
> > > @@ -199,6 +200,8 @@ typedef struct DrawTextContext {
> > > int tc24hmax; ///< 1 if timecode is wrapped
> to
> > 24
> > > hours, 0 otherwise
> > > int reload; ///< reload text file for each
> frame
> > > int start_number; ///< starting frame number for
> > > n/frame_num var
> > > + char *text_source_string; ///< the string to specify text data
> > > source
> > > + enum AVFrameSideDataType text_source;
> > > #if CONFIG_LIBFRIBIDI
> > > int text_shaping; ///< 1 to shape the text before
> > > drawing it
> > > #endif
> > > @@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
> > > { "alpha", "apply alpha while rendering", OFFSET(a_expr),
> > > AV_OPT_TYPE_STRING, { .str = "1" }, .flags = FLAGS },
> > > {"fix_bounds", "check and fix text coords to avoid clipping",
> > > OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
> > > {"start_number", "start frame number for n/frame_num variable",
> > > OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
> > > + {"text_source", "the source of text", OFFSET(text_source_string),
> > > AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
> > >
> > > #if CONFIG_LIBFRIBIDI
> > > {"text_shaping", "attempt to shape text before drawing",
> > > OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
> > > @@ -690,6 +694,16 @@ out:
> > > }
> > > #endif
> > >
> > > +static enum AVFrameSideDataType text_source_string_parse(const char
> > > *text_source_string)
> > > +{
> > > + av_assert0(text_source_string);
> > > + if (!strcmp(text_source_string, "side_data_detection_bboxes")) {
> > > + return AV_FRAME_DATA_DETECTION_BBOXES;
> > > + } else {
> > > + return AVERROR(EINVAL);
> > > + }
> > > +}
> > > +
> > > static av_cold int init(AVFilterContext *ctx)
> > > {
> > > int err;
> > > @@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
> > > s->text = av_strdup("");
> > > }
> > >
> > > + if (s->text_source_string) {
> > > + s->text_source =
> text_source_string_parse(s->text_source_string);
> > > + if ((int)s->text_source < 0) {
> > > + av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n",
> > > s->text_source_string);
> > > + return AVERROR(EINVAL);
> > > + }
> > > + }
> > > +
> > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > > + if (s->text) {
> > > + av_log(ctx, AV_LOG_WARNING, "Multiple texts provided,
> will
> > > use text_source only\n");
> > > + av_free(s->text);
> > > + }
> > > + s->text =
> > > av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
> > > + (AV_NUM_DETECTION_BBOX_CLASSIFY
> +
> > > 1));
> > > + if (!s->text)
> > > + return AVERROR(ENOMEM);
> > > + }
> > > +
> > > if (!s->text) {
> > > av_log(ctx, AV_LOG_ERROR,
> > > - "Either text, a valid file or a timecode must be
> > > provided\n");
> > > + "Either text, a valid file, a timecode or text source must
> > be
> > > provided\n");
> > > return AVERROR(EINVAL);
> > > }
> > >
> > > @@ -1440,10 +1473,15 @@ continue_on_invalid2:
> > >
> > > s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] =
> > s->max_glyph_h;
> > >
> > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> > > &s->prng);
> > > - s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values,
> > > &s->prng);
> > > - /* It is necessary if x is expressed from y */
> > > - s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values,
> > > &s->prng);
> > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > > + s->var_values[VAR_X] = s->x;
> > > + s->var_values[VAR_Y] = s->y;
> > > + } else {
> > > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> > > s->var_values, &s->prng);
> > > + s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr,
> > > s->var_values, &s->prng);
> > > + /* It is necessary if x is expressed from y */
> > > + s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr,
> > > s->var_values, &s->prng);
> > > + }
> > >
> > > update_alpha(s);
> > > update_color_with_alpha(s, &fontcolor , s->fontcolor );
> > > @@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink,
> > > AVFrame *frame)
> > > AVFilterLink *outlink = ctx->outputs[0];
> > > DrawTextContext *s = ctx->priv;
> > > int ret;
> > > + const AVDetectionBBoxHeader *header = NULL;
> > > + const AVDetectionBBox *bbox;
> > > + AVFrameSideData *sd;
> > > + int loop = 1;
> > > +
> > > > + if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
> > > + sd = av_frame_get_side_data(frame,
> > > AV_FRAME_DATA_DETECTION_BBOXES);
> > > + if (sd) {
> > > + header = (AVDetectionBBoxHeader *)sd->data;
> > > + loop = header->nb_bboxes;
> > > + } else {
> > > + av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
> > > + return ff_filter_frame(outlink, frame);
> > > + }
> > > + }
> > >
> > > if (s->reload) {
> > > if ((ret = load_textfile(ctx)) < 0) {
> > > @@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink,
> > > AVFrame *frame)
> > > s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
> > > s->metadata = frame->metadata;
> > >
> > > - draw_text(ctx, frame, frame->width, frame->height);
> > > + for (int i = 0; i < loop; i++) {
> > > + if (header) {
> > > + bbox = av_get_detection_bbox(header, i);
> > > + strcpy(s->text, bbox->detect_label);
> > > + for (int j = 0; j < bbox->classify_count; j++) {
> > > + strcat(s->text, ", ");
> > > + strcat(s->text, bbox->classify_labels[j]);
> > > + }
> > > + s->x = bbox->x;
> > > + s->y = bbox->y - s->fontsize;
> > > + }
> > > + draw_text(ctx, frame, frame->width, frame->height);
> > > + }
> > >
> > > av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d
> > > y:%d\n",
> > > (int)s->var_values[VAR_N], s->var_values[VAR_T],
> > > --
> >
> > any comment? thanks.
> >
> > A new option is added into vf_drawbox and vf_drawtext to visualize the
> > data from detection bounding boxes in the side data of AVFrame.
> >
>
> will push tomorrow if there's no objection.
will push soon
More information about the ffmpeg-devel
mailing list