FFmpeg
vf_dnn_detect.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 /**
20  * @file
21  * implementing an object detecting filter using deep learning networks.
22  */
23 
24 #include "libavutil/file_open.h"
25 #include "libavutil/opt.h"
26 #include "filters.h"
27 #include "dnn_filter_common.h"
28 #include "internal.h"
29 #include "video.h"
30 #include "libavutil/time.h"
31 #include "libavutil/avstring.h"
33 #include "libavutil/fifo.h"
34 
/* NOTE(review): extraction gaps — the enum body is missing here; per the symbol
 * index this is DNNDetectionModelType with values DDMT_SSD and DDMT_YOLOV1V2. */
35 typedef enum {
39 
/* Per-filter private state. Several fields are missing from this dump
 * (line-number gaps): dnnctx (DnnContext), labels_filename, label_count,
 * model_type, bboxes_fifo, scale_width/scale_height — see the symbol index. */
40 typedef struct DnnDetectContext {
41  const AVClass *class;
/* detection confidence threshold in [0,1]; also reused as the IOU threshold
 * during overlap removal (see the fifo-drain post-processing below) */
43  float confidence;
/* array of label strings loaded from labels_filename; label_count entries */
45  char **labels;
/* YOLO grid dimensions (cells across / down), set via options */
48  int cell_w;
49  int cell_h;
/* raw "a&b&c…" anchor option string, parsed into the anchors array below */
54  char *anchors_str;
55  float *anchors;
56  int nb_anchor;
58 
/* OFFSET addresses fields inside the embedded DnnContext; OFFSET2 addresses
 * fields of DnnDetectContext itself. */
59 #define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
60 #define OFFSET2(x) offsetof(DnnDetectContext, x)
61 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
/* AVOption table; backend constants are compiled in only when the matching
 * library is configured. NOTE(review): original line 70 is missing from this
 * dump — per the symbol index it expanded DNN_COMMON_OPTIONS here. */
62 static const AVOption dnn_detect_options[] = {
63  { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_OV }, INT_MIN, INT_MAX, FLAGS, "backend" },
64 #if (CONFIG_LIBTENSORFLOW == 1)
65  { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TF }, 0, 0, FLAGS, "backend" },
66 #endif
67 #if (CONFIG_LIBOPENVINO == 1)
68  { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, "backend" },
69 #endif
71  { "confidence", "threshold of confidence", OFFSET2(confidence), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0, 1, FLAGS},
72  { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
73  { "model_type", "DNN detection model type", OFFSET2(model_type), AV_OPT_TYPE_INT, { .i64 = DDMT_SSD }, INT_MIN, INT_MAX, FLAGS, "model_type" },
74  { "ssd", "output shape [1, 1, N, 7]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_SSD }, 0, 0, FLAGS, "model_type" },
75  { "yolo", "output shape [1, N*Cx*Cy*DetectionBox]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, "model_type" },
/* NOTE(review): INTMAX_MAX as max for AV_OPT_TYPE_INT options looks odd
 * (INT_MAX would be the natural bound) — verify against upstream. */
76  { "cell_w", "cell width", OFFSET2(cell_w), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
77  { "cell_h", "cell height", OFFSET2(cell_h), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
78  { "nb_classes", "The number of class", OFFSET2(nb_classes), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
79  { "anchors", "anchors, splited by '&'", OFFSET2(anchors_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
80  { NULL }
81 };
82 
83 AVFILTER_DEFINE_CLASS(dnn_detect);
84 
/**
 * Pick the class index with the highest score from a strided score array.
 *
 * Scores for class i live at label_data[i * cell_size] (YOLO layouts keep one
 * cell-sized plane per class, so cell_size is the stride between classes).
 *
 * @param nb_classes number of classes to scan
 * @param cell_size  stride (in floats) between consecutive class scores
 * @param label_data base of the score array
 * @return index of the highest score; 0 when nb_classes is 0 or all scores <= 0
 */
static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
{
    int best_id = 0;
    float best_prob = 0;

    for (int cls = 0; cls < nb_classes; cls++) {
        float prob = label_data[cls * cell_size];
        if (prob > best_prob) {
            best_prob = prob;
            best_id = cls;
        }
    }
    return best_id;
}
97 
/**
 * Parse an '&'-separated anchor list (e.g. "1.08&1.19&3.42") into a float array.
 *
 * @param anchors_str mutable option string; consumed in place by av_strtok()
 * @param anchors     on success receives a newly allocated array of parsed
 *                    values (caller frees with av_freep()); untouched on failure
 * @return number of anchors (separator count + 1), or 0 on allocation failure
 */
static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
{
    char *saveptr = NULL, *token;
    float *anchors_buf;
    int nb_anchor = 1, i = 0;

    /* number of anchors = number of '&' separators + 1 */
    while (anchors_str[i] != '\0') {
        if (anchors_str[i] == '&')
            nb_anchor++;
        i++;
    }
    /* fix: was sizeof(*anchors) == sizeof(float *), which over-allocates on
     * LP64; the element type is float, so size by the buffer's element */
    anchors_buf = av_mallocz(nb_anchor * sizeof(*anchors_buf));
    if (!anchors_buf) {
        return 0;
    }
    for (int i = 0; i < nb_anchor; i++) {
        token = av_strtok(anchors_str, "&", &saveptr);
        if (!token) {
            /* fewer tokens than separators imply (e.g. trailing '&'):
             * keep the zero-initialized remainder instead of passing NULL
             * to strtof(), which is undefined behavior */
            break;
        }
        anchors_buf[i] = strtof(token, NULL);
        anchors_str = NULL;
    }
    *anchors = anchors_buf;
    return nb_anchor;
}
121 
122 /* Calculate Intersection Over Union */
123 static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)
124 {
125  float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x);
126  float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y);
127  float intersection_area =
128  (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
129  float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area;
130  return intersection_area / union_area;
131 }
132 
/**
 * Decode one YOLO output tensor into candidate bounding boxes and push them
 * (as heap pointers) onto ctx->bboxes_fifo for later NMS/side-data fill.
 *
 * NOTE(review): extraction gaps — original lines 134 and 136 are missing; per
 * the symbol index the signature continues with "AVFilterContext *filter_ctx)"
 * and line 136 presumably declared "DnnDetectContext *ctx = filter_ctx->priv;".
 */
133 static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index,
135 {
137  float conf_threshold = ctx->confidence;
138  int detection_boxes, box_size, cell_w, cell_h, scale_w, scale_h;
139  int nb_classes = ctx->nb_classes;
140  float *output_data = output[output_index].data;
141  float *anchors = ctx->anchors;
142  AVDetectionBBox *bbox;
143 
/* NOTE(review): cell_w/cell_h/scale_w/scale_h are only assigned for
 * DDMT_YOLOV1V2 in this dump; for other model types they would be read
 * uninitialized below — a branch may be missing from the extraction. */
144  if (ctx->model_type == DDMT_YOLOV1V2) {
145  cell_w = ctx->cell_w;
146  cell_h = ctx->cell_h;
147  scale_w = cell_w;
148  scale_h = cell_h;
149  }
/* each detection box: x, y, w, h, objectness + one score per class */
150  box_size = nb_classes + 5;
151 
/* NOTE(review): message wording looks inverted — presumably "are not set". */
152  if (!cell_h || !cell_w) {
153  av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are detected\n");
154  return AVERROR(EINVAL);
155  }
156 
157  if (!nb_classes) {
158  av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n");
159  return AVERROR(EINVAL);
160  }
161 
162  if (!anchors) {
163  av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n");
164  return AVERROR(EINVAL);
165  }
166 
/* tensor volume must be a whole number of (box * grid) units */
167  if (output[output_index].channels * output[output_index].width *
168  output[output_index].height % (box_size * cell_w * cell_h)) {
169  av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");
170  return AVERROR(EINVAL);
171  }
172  detection_boxes = output[output_index].channels *
173  output[output_index].height *
174  output[output_index].width / box_size / cell_w / cell_h;
175 
176  /**
177  * find all candidate bbox
178  * yolo output can be reshaped to [B, N*D, Cx, Cy]
179  * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...,]
180  **/
181  for (int box_id = 0; box_id < detection_boxes; box_id++) {
182  for (int cx = 0; cx < cell_w; cx++)
183  for (int cy = 0; cy < cell_h; cy++) {
184  float x, y, w, h, conf;
185  float *detection_boxes_data;
186  int label_id;
187 
/* planes are laid out channel-major: offset k*cell_w*cell_h selects field k */
188  detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
189  conf = detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h];
190  if (conf < conf_threshold) {
191  continue;
192  }
193 
194  x = detection_boxes_data[cy * cell_w + cx];
195  y = detection_boxes_data[cy * cell_w + cx + cell_w * cell_h];
196  w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
197  h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
198  label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
199  detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
/* final confidence = objectness * best class score */
200  conf = conf * detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h];
201 
202  bbox = av_mallocz(sizeof(*bbox));
203  if (!bbox)
204  return AVERROR(ENOMEM);
205 
/* map grid-relative prediction back to pixel coordinates (anchor-scaled) */
206  bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w;
207  bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h;
208  bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2;
209  bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2;
210  bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
211  if (ctx->labels && label_id < ctx->label_count) {
212  av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
213  } else {
214  snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
215  }
216 
/* fifo stores the pointer itself; ownership moves to the fifo on success */
217  if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) {
218  av_freep(&bbox);
219  return AVERROR(ENOMEM);
220  }
221  }
222  }
223  return 0;
224 }
225 
/* Drain ctx->bboxes_fifo: mark overlapping lower-confidence boxes as bad,
 * create AVDetectionBBoxHeader side data sized for the survivors, and copy
 * them in. NOTE(review): the signature (original line 226) is missing from
 * this dump, as are lines 228/232 (presumably the ctx declaration) and 254
 * (presumably header = av_detection_bbox_create_side_data(frame, nb_bboxes)). */
227 {
229  float conf_threshold = ctx->confidence;
230  AVDetectionBBox *bbox;
231  int nb_bboxes = 0;
233  if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {
234  av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
235  return 0;
236  }
237 
/* NOTE(review): the confidence threshold doubles as the IOU threshold here —
 * confirm this is intended rather than a separate nms option. */
238  /* remove overlap bboxes */
239  for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++){
240  av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);
241  for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {
242  AVDetectionBBox *overlap_bbox;
243  av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);
/* suppress this box if a same-label box with strictly higher confidence
 * overlaps it enough; self-comparison is harmless (av_cmp_q == 0) */
244  if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&
245  av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&
246  dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {
247  bbox->classify_count = -1; // bad result
248  nb_bboxes++;
249  break;
250  }
251  }
252  }
/* nb_bboxes now counts the survivors */
253  nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;
255  if (!header) {
256  av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
257  return -1;
258  }
259  av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
260 
/* copy surviving boxes into the side data and free every fifo entry */
261  while(av_fifo_can_read(ctx->bboxes_fifo)) {
262  AVDetectionBBox *candidate_bbox;
263  av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);
264 
265  if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {
266  bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
267  memcpy(bbox, candidate_bbox, sizeof(*bbox));
268  nb_bboxes--;
269  }
270  av_freep(&candidate_bbox);
271  }
272  return 0;
273 }
274 
/* YOLO post-processing entry point (symbol index: dnn_detect_post_proc_yolo,
 * line 275). NOTE(review): the signature and the two calls whose return value
 * is checked below (original lines 275, 278, 281 — presumably
 * dnn_detect_parse_yolo_output() and the side-data fill) are missing from
 * this dump; only the error-propagation skeleton survived. */
276 {
277  int ret = 0;
279  if (ret < 0)
280  return ret;
282  if (ret < 0)
283  return ret;
284  return 0;
285 }
286 
/* SSD post-processing: output is [1,1,N,7] rows of
 * (image_id, label, conf, x0, y0, x1, y1) with normalized coordinates.
 * NOTE(review): the signature (line 287, per symbol index:
 * static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output,
 * AVFilterContext *filter_ctx)) and lines 289/295/316 (presumably the ctx
 * declaration and header = av_detection_bbox_create_side_data(...)) are
 * missing from this dump. */
288 {
290  float conf_threshold = ctx->confidence;
291  int proposal_count = output->height;
292  int detect_size = output->width;
293  float *detections = output->data;
294  int nb_bboxes = 0;
296  AVDetectionBBox *bbox;
297 
298  if (output->width != 7) {
299  av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
300  return AVERROR(EINVAL);
301  }
302 
/* first pass: count proposals above the confidence threshold */
303  for (int i = 0; i < proposal_count; ++i) {
304  float conf = detections[i * detect_size + 2];
305  if (conf < conf_threshold) {
306  continue;
307  }
308  nb_bboxes++;
309  }
310 
311  if (nb_bboxes == 0) {
312  av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
313  return 0;
314  }
315 
317  if (!header) {
318  av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
319  return -1;
320  }
321 
322  av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
323 
/* second pass: fill side data, compacting accepted boxes to the front */
324  for (int i = 0; i < proposal_count; ++i) {
325  int av_unused image_id = (int)detections[i * detect_size + 0];
326  int label_id = (int)detections[i * detect_size + 1];
327  float conf = detections[i * detect_size + 2];
328  float x0 = detections[i * detect_size + 3];
329  float y0 = detections[i * detect_size + 4];
330  float x1 = detections[i * detect_size + 5];
331  float y1 = detections[i * detect_size + 6];
332 
333  if (conf < conf_threshold) {
334  continue;
335  }
336 
337  bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
/* normalized [0,1] coordinates scaled to frame pixels */
338  bbox->x = (int)(x0 * frame->width);
339  bbox->w = (int)(x1 * frame->width) - bbox->x;
340  bbox->y = (int)(y0 * frame->height);
341  bbox->h = (int)(y1 * frame->height) - bbox->y;
342 
343  bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
344  bbox->classify_count = 0;
345 
346  if (ctx->labels && label_id < ctx->label_count) {
347  av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
348  } else {
349  snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
350  }
351 
352  nb_bboxes--;
353  if (nb_bboxes == 0) {
354  break;
355  }
356  }
357 
358  return 0;
359 }
360 
/* OpenVINO post-processing dispatcher (symbol index line 361): refuse frames
 * that already carry detection side data, then route to the model-specific
 * handler. NOTE(review): the signature and lines 364/367/375/380 (presumably
 * the ctx declaration, the av_frame_get_side_data() lookup, and the
 * ssd/yolo handler calls) are missing from this dump. */
362 {
363  AVFrameSideData *sd;
365  int ret = 0;
366 
368  if (sd) {
369  av_log(filter_ctx, AV_LOG_ERROR, "already have bounding boxes in side data.\n");
370  return -1;
371  }
372 
/* NOTE(review): the DDMT_YOLOV1V2 case falls out of the switch without a
 * break in this dump — confirm against the complete source. */
373  switch (ctx->model_type) {
374  case DDMT_SSD:
376  if (ret < 0)
377  return ret;
378  break;
379  case DDMT_YOLOV1V2:
381  if (ret < 0)
382  return ret;
383  }
384 
385  return 0;
386 }
387 
/* TensorFlow post-processing: four output tensors — output[0] proposal count,
 * output[1] confidences, output[2] label ids, output[3] positions (y0,x0,y1,x1
 * normalized). NOTE(review): the signature (original line 388) and lines
 * 390/397/404/421 (presumably ctx declaration, side-data lookup, and
 * header = av_detection_bbox_create_side_data(...)) are missing from this dump. */
389 {
391  int proposal_count;
392  float conf_threshold = ctx->confidence;
393  float *conf, *position, *label_id, x0, y0, x1, y1;
394  int nb_bboxes = 0;
395  AVFrameSideData *sd;
396  AVDetectionBBox *bbox;
398 
399  proposal_count = *(float *)(output[0].data);
400  conf = output[1].data;
401  position = output[3].data;
402  label_id = output[2].data;
403 
405  if (sd) {
406  av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
407  return -1;
408  }
409 
410  for (int i = 0; i < proposal_count; ++i) {
411  if (conf[i] < conf_threshold)
412  continue;
413  nb_bboxes++;
414  }
415 
416  if (nb_bboxes == 0) {
417  av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
418  return 0;
419  }
420 
422  if (!header) {
423  av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
424  return -1;
425  }
426 
427  av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
428 
429  for (int i = 0; i < proposal_count; ++i) {
/* positions are (y0, x0, y1, x1), normalized to [0,1] */
430  y0 = position[i * 4];
431  x0 = position[i * 4 + 1];
432  y1 = position[i * 4 + 2];
433  x1 = position[i * 4 + 3];
434 
/* NOTE(review): indexing the header by the raw proposal index i, while the
 * header was sized for nb_bboxes accepted boxes — if any earlier proposal was
 * filtered out, i can exceed the allocation. Verify against upstream. */
435  bbox = av_get_detection_bbox(header, i);
436 
437  if (conf[i] < conf_threshold) {
438  continue;
439  }
440 
441  bbox->x = (int)(x0 * frame->width);
442  bbox->w = (int)(x1 * frame->width) - bbox->x;
443  bbox->y = (int)(y0 * frame->height);
444  bbox->h = (int)(y1 * frame->height) - bbox->y;
445 
446  bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
447  bbox->classify_count = 0;
448 
449  if (ctx->labels && label_id[i] < ctx->label_count) {
450  av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));
451  } else {
452  snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);
453  }
454 
455  nb_bboxes--;
456  if (nb_bboxes == 0) {
457  break;
458  }
459  }
460  return 0;
461 }
462 
/* Backend dispatcher registered as the DNN detect post-proc callback (symbol
 * index line 463). NOTE(review): the signature and lines 465/469/471
 * (presumably the ctx declaration and the returns calling
 * dnn_detect_post_proc_ov()/dnn_detect_post_proc_tf()) are missing from this
 * dump; only the switch skeleton survived. */
464 {
466  DnnContext *dnn_ctx = &ctx->dnnctx;
467  switch (dnn_ctx->backend_type) {
468  case DNN_OV:
470  case DNN_TF:
472  default:
473  avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
474  return AVERROR(EINVAL);
475  }
476 }
477 
/* Free every label string and the label array itself, resetting the count so
 * the context is safe to reuse. NOTE(review): the signature line (478) is
 * missing from this dump; per the symbol index it is
 * static void free_detect_labels(DnnDetectContext *ctx). */
479 {
480  for (int i = 0; i < ctx->label_count; i++) {
481  av_freep(&ctx->labels[i]);
482  }
483  ctx->label_count = 0;
484  av_freep(&ctx->labels);
485 }
486 
488 {
489  int line_len;
490  FILE *file;
491  DnnDetectContext *ctx = context->priv;
492 
493  file = avpriv_fopen_utf8(ctx->labels_filename, "r");
494  if (!file){
495  av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
496  return AVERROR(EINVAL);
497  }
498 
499  while (!feof(file)) {
500  char *label;
501  char buf[256];
502  if (!fgets(buf, 256, file)) {
503  break;
504  }
505 
506  line_len = strlen(buf);
507  while (line_len) {
508  int i = line_len - 1;
509  if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
510  buf[i] = '\0';
511  line_len--;
512  } else {
513  break;
514  }
515  }
516 
517  if (line_len == 0) // empty line
518  continue;
519 
520  if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
521  av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
522  fclose(file);
523  return AVERROR(EINVAL);
524  }
525 
526  label = av_strdup(buf);
527  if (!label) {
528  av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
529  fclose(file);
530  return AVERROR(ENOMEM);
531  }
532 
533  if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
534  av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
535  fclose(file);
536  av_freep(&label);
537  return AVERROR(ENOMEM);
538  }
539  }
540 
541  fclose(file);
542  return 0;
543 }
544 
545 static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)
546 {
547  switch(backend_type) {
548  case DNN_TF:
549  if (output_nb != 4) {
550  av_log(ctx, AV_LOG_ERROR, "Only support tensorflow detect model with 4 outputs, \
551  but get %d instead\n", output_nb);
552  return AVERROR(EINVAL);
553  }
554  return 0;
555  case DNN_OV:
556  if (output_nb != 1) {
557  av_log(ctx, AV_LOG_ERROR, "Dnn detect filter with openvino backend needs 1 output only, \
558  but get %d instead\n", output_nb);
559  return AVERROR(EINVAL);
560  }
561  return 0;
562  default:
563  avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");
564  return AVERROR(EINVAL);
565  }
566  return 0;
567 }
568 
/* Filter init (symbol index line 569: static av_cold int
 * dnn_detect_init(AVFilterContext *context)): initialize the DNN context,
 * validate output counts, allocate the bbox fifo, then load labels/anchors.
 * NOTE(review): original lines 575, 584 and 587 are missing from this dump —
 * presumably ret = ff_dnn_init(...), ff_dnn_set_detect_post_proc(...) and the
 * read_detect_label_file() call inside the labels_filename branch. */
570 {
571  DnnDetectContext *ctx = context->priv;
572  DnnContext *dnn_ctx = &ctx->dnnctx;
573  int ret;
574 
576  if (ret < 0)
577  return ret;
578  ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);
579  if (ret < 0)
580  return ret;
/* fifo of AVDetectionBBox* pointers; auto-grows from a single slot */
581  ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);
582  if (!ctx->bboxes_fifo)
583  return AVERROR(ENOMEM);
585 
586  if (ctx->labels_filename) {
588  }
589  if (ctx->anchors_str) {
590  ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors);
591  if (!ctx->anchors) {
592  av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n");
593  return AVERROR(EINVAL);
594  }
595  ctx->nb_anchor = ret;
596  }
597  return 0;
598 }
599 
/* Supported input pixel formats. NOTE(review): the initializer entries
 * (original lines 601-606) are missing from this dump; the symbol index
 * references RGB24, BGR24, GRAY8, GRAYF32, YUV420P and NV12 — confirm the
 * exact list (and the AV_PIX_FMT_NONE terminator) against the full source. */
600 static const enum AVPixelFormat pix_fmts[] = {
607 };
608 
609 static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
610 {
611  DnnDetectContext *ctx = outlink->src->priv;
612  int ret;
613  DNNAsyncStatusType async_state;
614 
615  ret = ff_dnn_flush(&ctx->dnnctx);
616  if (ret != 0) {
617  return -1;
618  }
619 
620  do {
621  AVFrame *in_frame = NULL;
622  AVFrame *out_frame = NULL;
623  async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
624  if (async_state == DAST_SUCCESS) {
625  ret = ff_filter_frame(outlink, in_frame);
626  if (ret < 0)
627  return ret;
628  if (out_pts)
629  *out_pts = in_frame->pts + pts;
630  }
631  av_usleep(5000);
632  } while (async_state >= DAST_NOT_READY);
633 
634  return 0;
635 }
636 
/* Activate callback (symbol index line 637: static int
 * dnn_detect_activate(AVFilterContext *filter_ctx)): consume all pending
 * input frames into the async DNN queue, drain finished results downstream,
 * and handle EOF by flushing. NOTE(review): original lines 641, 648, 652,
 * 679 and 688 are missing from this dump — presumably the ctx declaration,
 * FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink),
 * ret = ff_inlink_consume_frame(inlink, &in),
 * if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { and
 * FF_FILTER_FORWARD_WANTED(outlink, inlink). */
638 {
639  AVFilterLink *inlink = filter_ctx->inputs[0];
640  AVFilterLink *outlink = filter_ctx->outputs[0];
642  AVFrame *in = NULL;
643  int64_t pts;
644  int ret, status;
645  int got_frame = 0;
646  int async_state;
647 
649 
650  do {
651  // drain all input frames
653  if (ret < 0)
654  return ret;
655  if (ret > 0) {
656  if (ff_dnn_execute_model(&ctx->dnnctx, in, NULL) != 0) {
657  return AVERROR(EIO);
658  }
659  }
660  } while (ret > 0);
661 
662  // drain all processed frames
663  do {
664  AVFrame *in_frame = NULL;
665  AVFrame *out_frame = NULL;
666  async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
667  if (async_state == DAST_SUCCESS) {
668  ret = ff_filter_frame(outlink, in_frame);
669  if (ret < 0)
670  return ret;
671  got_frame = 1;
672  }
673  } while (async_state == DAST_SUCCESS);
674 
675  // if frame got, schedule to next filter
676  if (got_frame)
677  return 0;
678 
680  if (status == AVERROR_EOF) {
681  int64_t out_pts = pts;
682  ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
683  ff_outlink_set_status(outlink, status, out_pts);
684  return ret;
685  }
686  }
687 
689 
690  return 0;
691 }
692 
/* Filter uninit (symbol index line 693: static av_cold void
 * dnn_detect_uninit(AVFilterContext *context)): tear down the DNN context,
 * free any boxes still queued in the fifo, then the fifo and anchors.
 * NOTE(review): original line 704 is missing — presumably the
 * free_detect_labels(ctx) call. Also note av_fifo_can_read() is called
 * without a NULL check on bboxes_fifo; confirm init failure paths cannot
 * reach here with a NULL fifo. */
694 {
695  DnnDetectContext *ctx = context->priv;
696  AVDetectionBBox *bbox;
697  ff_dnn_uninit(&ctx->dnnctx);
698  while(av_fifo_can_read(ctx->bboxes_fifo)) {
699  av_fifo_read(ctx->bboxes_fifo, &bbox, 1);
700  av_freep(&bbox);
701  }
702  av_fifo_freep2(&ctx->bboxes_fifo);
703  av_freep(&ctx->anchors);
705 }
706 
/* Public filter definition (symbol index line 707:
 * const AVFilter ff_vf_dnn_detect = {). NOTE(review): original lines 711-715
 * are missing from this dump — presumably the init/uninit callbacks,
 * FILTER_INPUTS(ff_video_default_filterpad), the outputs, and
 * FILTER_PIXFMTS_ARRAY(pix_fmts). */
708  .name = "dnn_detect",
709  .description = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
710  .priv_size = sizeof(DnnDetectContext),
716  .priv_class = &dnn_detect_class,
717  .activate = dnn_detect_activate,
718 };
pix_fmts
static enum AVPixelFormat pix_fmts[]
Definition: vf_dnn_detect.c:600
DnnDetectContext::nb_classes
int nb_classes
Definition: vf_dnn_detect.c:50
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
dnn_detect_parse_yolo_output
static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:133
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
av_frame_get_side_data
AVFrameSideData * av_frame_get_side_data(const AVFrame *frame, enum AVFrameSideDataType type)
Definition: frame.c:824
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:978
AVERROR_EOF
#define AVERROR_EOF
End of file.
Definition: error.h:57
FILTER_PIXFMTS_ARRAY
#define FILTER_PIXFMTS_ARRAY(array)
Definition: internal.h:172
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
av_unused
#define av_unused
Definition: attributes.h:131
av_fifo_peek
int av_fifo_peek(const AVFifo *f, void *buf, size_t nb_elems, size_t offset)
Read data from a FIFO without modifying FIFO state.
Definition: fifo.c:255
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:340
AVFrame::pts
int64_t pts
Presentation timestamp in time_base units (time when frame should be shown to user).
Definition: frame.h:452
AVFILTER_DEFINE_CLASS
AVFILTER_DEFINE_CLASS(dnn_detect)
AVFrame::width
int width
Definition: frame.h:412
w
uint8_t w
Definition: llviddspenc.c:38
read_detect_label_file
static int read_detect_label_file(AVFilterContext *context)
Definition: vf_dnn_detect.c:487
AVOption
AVOption.
Definition: opt.h:251
data
const char data[16]
Definition: mxf.c:148
dnn_detect_init
static av_cold int dnn_detect_init(AVFilterContext *context)
Definition: vf_dnn_detect.c:569
output_data
static int output_data(MLPDecodeContext *m, unsigned int substr, AVFrame *frame, int *got_frame_ptr)
Write the audio data into the output buffer.
Definition: mlpdec.c:1109
AV_LOG_VERBOSE
#define AV_LOG_VERBOSE
Detailed information.
Definition: log.h:196
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:170
dnn_filter_common.h
AVDetectionBBox::y
int y
Definition: detection_bbox.h:32
video.h
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:199
dnn_detect_post_proc_ov
static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:361
ff_inlink_consume_frame
int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)
Take a frame from the link's FIFO and update the link's stats.
Definition: avfilter.c:1393
fifo.h
AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE
#define AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE
Definition: detection_bbox.h:36
AVDetectionBBox::detect_label
char detect_label[AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE]
Detect result with confidence.
Definition: detection_bbox.h:41
AVFilterContext::priv
void * priv
private data for use by the filter
Definition: avfilter.h:412
av_fifo_write
int av_fifo_write(AVFifo *f, const void *buf, size_t nb_elems)
Write data into a FIFO.
Definition: fifo.c:188
DnnContext
Definition: dnn_filter_common.h:29
dnn_detect_IOU
static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)
Definition: vf_dnn_detect.c:123
filter_ctx
static FilteringContext * filter_ctx
Definition: transcode.c:51
dnn_detect_uninit
static av_cold void dnn_detect_uninit(AVFilterContext *context)
Definition: vf_dnn_detect.c:693
DnnDetectContext
Definition: vf_dnn_detect.c:40
pts
static int64_t pts
Definition: transcode_aac.c:643
DnnDetectContext::model_type
DNNDetectionModelType model_type
Definition: vf_dnn_detect.c:47
av_get_detection_bbox
static av_always_inline AVDetectionBBox * av_get_detection_bbox(const AVDetectionBBoxHeader *header, unsigned int idx)
Definition: detection_bbox.h:84
DnnDetectContext::scale_height
int scale_height
Definition: vf_dnn_detect.c:53
DNN_TF
@ DNN_TF
Definition: dnn_interface.h:35
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:180
av_cold
#define av_cold
Definition: attributes.h:90
av_fifo_read
int av_fifo_read(AVFifo *f, void *buf, size_t nb_elems)
Read data from a FIFO.
Definition: fifo.c:240
ff_video_default_filterpad
const AVFilterPad ff_video_default_filterpad[1]
An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_VIDEO.
Definition: video.c:36
DnnDetectContext::bboxes_fifo
AVFifo * bboxes_fifo
Definition: vf_dnn_detect.c:51
ff_outlink_set_status
static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)
Set the status field of a link from the source filter.
Definition: filters.h:189
width
#define width
ff_dnn_set_detect_post_proc
int ff_dnn_set_detect_post_proc(DnnContext *ctx, DetectPostProc post_proc)
Definition: dnn_filter_common.c:97
av_strtok
char * av_strtok(char *s, const char *delim, char **saveptr)
Split the string into several tokens which can be accessed by successive calls to av_strtok().
Definition: avstring.c:178
init
int(* init)(AVBSFContext *ctx)
Definition: dts2pts_bsf.c:365
free_detect_labels
static void free_detect_labels(DnnDetectContext *ctx)
Definition: vf_dnn_detect.c:478
DNNData
Definition: dnn_interface.h:65
filters.h
ff_dnn_get_result
DNNAsyncStatusType ff_dnn_get_result(DnnContext *ctx, AVFrame **in_frame, AVFrame **out_frame)
Definition: dnn_filter_common.c:147
ctx
AVFormatContext * ctx
Definition: movenc.c:48
channels
channels
Definition: aptx.h:31
ff_vf_dnn_detect
const AVFilter ff_vf_dnn_detect
Definition: vf_dnn_detect.c:707
DnnDetectContext::scale_width
int scale_width
Definition: vf_dnn_detect.c:52
AV_PIX_FMT_YUV420P
@ AV_PIX_FMT_YUV420P
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:66
av_usleep
int av_usleep(unsigned usec)
Sleep for a period of time.
Definition: time.c:84
AV_PIX_FMT_GRAYF32
#define AV_PIX_FMT_GRAYF32
Definition: pixfmt.h:501
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: internal.h:192
file_open.h
frame
static AVFrame * frame
Definition: demux_decode.c:54
DNN_OV
@ DNN_OV
Definition: dnn_interface.h:35
dnn_detect_post_proc_ssd
static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:287
context
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are in without and describe what they for example set the foo of the bar offset is the offset of the field in your context
Definition: writing_filters.txt:91
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:66
NULL
#define NULL
Definition: coverity.c:32
AVDetectionBBoxHeader
Definition: detection_bbox.h:56
DnnDetectContext::dnnctx
DnnContext dnnctx
Definition: vf_dnn_detect.c:42
DnnDetectContext::cell_h
int cell_h
Definition: vf_dnn_detect.c:49
dnn_detect_activate
static int dnn_detect_activate(AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:637
DnnDetectContext::labels_filename
char * labels_filename
Definition: vf_dnn_detect.c:44
av_fifo_can_read
size_t av_fifo_can_read(const AVFifo *f)
Definition: fifo.c:87
DnnDetectContext::labels
char ** labels
Definition: vf_dnn_detect.c:45
time.h
AV_PIX_FMT_GRAY8
@ AV_PIX_FMT_GRAY8
Y , 8bpp.
Definition: pixfmt.h:74
exp
int8_t exp
Definition: eval.c:72
ff_dnn_flush
int ff_dnn_flush(DnnContext *ctx)
Definition: dnn_filter_common.c:152
ff_inlink_acknowledge_status
int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)
Test and acknowledge the change of status on the link.
Definition: avfilter.c:1347
FLAGS
#define FLAGS
Definition: vf_dnn_detect.c:61
av_detection_bbox_create_side_data
AVDetectionBBoxHeader * av_detection_bbox_create_side_data(AVFrame *frame, uint32_t nb_bboxes)
Allocates memory for AVDetectionBBoxHeader, plus an array of.
Definition: detection_bbox.c:51
DNN_COMMON_OPTIONS
#define DNN_COMMON_OPTIONS
Definition: dnn_filter_common.h:43
DnnContext::backend_type
DNNBackendType backend_type
Definition: dnn_filter_common.h:31
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
AVFifo
Definition: fifo.c:35
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:106
DnnDetectContext::label_count
int label_count
Definition: vf_dnn_detect.c:46
DAST_SUCCESS
@ DAST_SUCCESS
Definition: dnn_interface.h:49
AVDetectionBBox::w
int w
Definition: detection_bbox.h:33
DNNBackendType
DNNBackendType
Definition: dnn_interface.h:35
DnnContext::nb_outputs
uint32_t nb_outputs
Definition: dnn_filter_common.h:38
av_make_q
static AVRational av_make_q(int num, int den)
Create an AVRational.
Definition: rational.h:71
avpriv_report_missing_feature
void avpriv_report_missing_feature(void *avc, const char *msg,...) av_printf_format(2
Log a generic warning message about a missing feature.
header
static const uint8_t header[24]
Definition: sdr2.c:67
AVDetectionBBox::classify_count
uint32_t classify_count
Definition: detection_bbox.h:51
height
#define height
DDMT_YOLOV1V2
@ DDMT_YOLOV1V2
Definition: vf_dnn_detect.c:37
DnnDetectContext::anchors_str
char * anchors_str
Definition: vf_dnn_detect.c:54
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
dnn_detect_options
static const AVOption dnn_detect_options[]
Definition: vf_dnn_detect.c:62
internal.h
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Definition: opt.h:228
dnn_detect_flush_frame
static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
Definition: vf_dnn_detect.c:609
dnn_detect_post_proc
static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:463
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:245
DFT_ANALYTICS_DETECT
@ DFT_ANALYTICS_DETECT
Definition: dnn_interface.h:55
check_output_nb
static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)
Definition: vf_dnn_detect.c:545
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
av_mallocz
void * av_mallocz(size_t size)
Allocate a memory block with alignment suitable for all memory accesses (including vectors if availab...
Definition: mem.c:254
avpriv_fopen_utf8
FILE * avpriv_fopen_utf8(const char *path, const char *mode)
Open a file using a UTF-8 filename.
Definition: file_open.c:159
dnn_detect_post_proc_yolo
static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:275
DnnDetectContext::confidence
float confidence
Definition: vf_dnn_detect.c:43
av_cmp_q
static int av_cmp_q(AVRational a, AVRational b)
Compare two rationals.
Definition: rational.h:89
AVFilter
Filter definition.
Definition: avfilter.h:166
ret
ret
Definition: filter_design.txt:187
AV_PIX_FMT_NV12
@ AV_PIX_FMT_NV12
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:89
AVDetectionBBox::h
int h
Definition: detection_bbox.h:34
av_fifo_alloc2
AVFifo * av_fifo_alloc2(size_t nb_elems, size_t elem_size, unsigned int flags)
Allocate and initialize an AVFifo with a given element size.
Definition: fifo.c:47
AVDetectionBBox::detect_confidence
AVRational detect_confidence
Definition: detection_bbox.h:42
av_dynarray_add_nofree
int av_dynarray_add_nofree(void *tab_ptr, int *nb_ptr, void *elem)
Add an element to a dynamic array.
Definition: mem.c:313
AVFrame::height
int height
Definition: frame.h:412
DDMT_SSD
@ DDMT_SSD
Definition: vf_dnn_detect.c:36
status
ov_status_e status
Definition: dnn_backend_openvino.c:119
DNNDetectionModelType
DNNDetectionModelType
Definition: vf_dnn_detect.c:35
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:65
AV_OPT_TYPE_INT
@ AV_OPT_TYPE_INT
Definition: opt.h:225
AVDetectionBBox::x
int x
Distance in pixels from the left/top edge of the frame, together with width and height,...
Definition: detection_bbox.h:31
DnnDetectContext::nb_anchor
int nb_anchor
Definition: vf_dnn_detect.c:56
AV_PIX_FMT_YUV444P
@ AV_PIX_FMT_YUV444P
planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
Definition: pixfmt.h:71
AVFilterContext
An instance of a filter.
Definition: avfilter.h:397
av_strdup
char * av_strdup(const char *s)
Duplicate a string.
Definition: mem.c:270
AV_PIX_FMT_YUV422P
@ AV_PIX_FMT_YUV422P
planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
Definition: pixfmt.h:70
OFFSET
#define OFFSET(x)
Definition: vf_dnn_detect.c:59
dnn_detect_fill_side_data
static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:226
AVFrameSideData
Structure to hold side data for an AVFrame.
Definition: frame.h:246
DnnDetectContext::cell_w
int cell_w
Definition: vf_dnn_detect.c:48
DnnDetectContext::anchors
float * anchors
Definition: vf_dnn_detect.c:55
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: internal.h:193
ff_dnn_init
int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx)
Definition: dnn_filter_common.c:54
dnn_detect_get_label_id
static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
Definition: vf_dnn_detect.c:85
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:34
AV_PIX_FMT_YUV411P
@ AV_PIX_FMT_YUV411P
planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
Definition: pixfmt.h:73
AV_PIX_FMT_YUV410P
@ AV_PIX_FMT_YUV410P
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:72
av_strlcpy
size_t av_strlcpy(char *dst, const char *src, size_t size)
Copy the string src to dst, but no more than size - 1 bytes, and null-terminate dst.
Definition: avstring.c:85
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
av_fifo_freep2
void av_fifo_freep2(AVFifo **f)
Free an AVFifo and reset pointer to NULL.
Definition: fifo.c:286
ff_dnn_uninit
void ff_dnn_uninit(DnnContext *ctx)
Definition: dnn_filter_common.c:157
AVDetectionBBox
Definition: detection_bbox.h:26
uninit
static av_cold int uninit(AVCodecContext *avctx)
Definition: crystalhd.c:285
h
h
Definition: vp9dsp_template.c:2038
dnn_detect_post_proc_tf
static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
Definition: vf_dnn_detect.c:388
ff_dnn_execute_model
int ff_dnn_execute_model(DnnContext *ctx, AVFrame *in_frame, AVFrame *out_frame)
Definition: dnn_filter_common.c:120
avstring.h
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Definition: opt.h:229
dnn_detect_parse_anchors
static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
Definition: vf_dnn_detect.c:98
DAST_NOT_READY
@ DAST_NOT_READY
Definition: dnn_interface.h:48
int
int
Definition: ffmpeg_filter.c:368
DNNAsyncStatusType
DNNAsyncStatusType
Definition: dnn_interface.h:45
AV_OPT_TYPE_CONST
@ AV_OPT_TYPE_CONST
Definition: opt.h:234
snprintf
#define snprintf
Definition: snprintf.h:34
OFFSET2
#define OFFSET2(x)
Definition: vf_dnn_detect.c:60
detection_bbox.h
AV_FIFO_FLAG_AUTO_GROW
#define AV_FIFO_FLAG_AUTO_GROW
Automatically resize the FIFO on writes, so that the data fits.
Definition: fifo.h:67
AV_FRAME_DATA_DETECTION_BBOXES
@ AV_FRAME_DATA_DETECTION_BBOXES
Bounding boxes for object detection and classification, as described by AVDetectionBBoxHeader.
Definition: frame.h:190