FFmpeg
vf_dnn_detect.c
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * implements an object detection filter using deep learning networks.
 */

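/*
 * Typical usage (model, layer names and label file are placeholders for a
 * real OpenVINO detection model):
 *
 *   ffmpeg -i input.jpg -vf dnn_detect=dnn_backend=openvino:model=detect.xml:input=data:output=detection_out:confidence=0.6:labels=labels.txt,showinfo -f null -
 *
 * Detected boxes are attached to each frame as AV_FRAME_DATA_DETECTION_BBOXES
 * side data, which downstream filters (e.g. dnn_classify, or drawbox with
 * box_source=side_data_detection_bboxes) can consume.
 */
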
#include "libavutil/file_open.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "filters.h"
#include "dnn_filter_common.h"
#include "internal.h"
#include "video.h"
#include "libavutil/time.h"
#include "libavutil/avstring.h"
#include "libavutil/detection_bbox.h"
#include "libavutil/fifo.h"

typedef enum {
    DDMT_SSD,
    DDMT_YOLOV1V2,
    DDMT_YOLOV3,
    DDMT_YOLOV4,
} DNNDetectionModelType;

typedef struct DnnDetectContext {
    const AVClass *class;
    DnnContext dnnctx;
    float confidence;
    char *labels_filename;
    char **labels;
    int label_count;
    DNNDetectionModelType model_type;
    int cell_w;
    int cell_h;
    int nb_classes;
    AVFifo *bboxes_fifo;
    int scale_width;
    int scale_height;
    char *anchors_str;
    float *anchors;
    int nb_anchor;
} DnnDetectContext;

#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
#define OFFSET2(x) offsetof(DnnDetectContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
static const AVOption dnn_detect_options[] = {
    { "dnn_backend", "DNN backend",              OFFSET(backend_type),     AV_OPT_TYPE_INT,    { .i64 = DNN_OV },        INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
#if (CONFIG_LIBTENSORFLOW == 1)
    { "tensorflow",  "tensorflow backend flag",  0,                        AV_OPT_TYPE_CONST,  { .i64 = DNN_TF },        0, 0, FLAGS, .unit = "backend" },
#endif
#if (CONFIG_LIBOPENVINO == 1)
    { "openvino",    "openvino backend flag",    0,                        AV_OPT_TYPE_CONST,  { .i64 = DNN_OV },        0, 0, FLAGS, .unit = "backend" },
#endif
    DNN_COMMON_OPTIONS
    { "confidence",  "threshold of confidence",  OFFSET2(confidence),      AV_OPT_TYPE_FLOAT,  { .dbl = 0.5 },           0, 1, FLAGS },
    { "labels",      "path to labels file",      OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL },          0, 0, FLAGS },
    { "model_type",  "DNN detection model type", OFFSET2(model_type),      AV_OPT_TYPE_INT,    { .i64 = DDMT_SSD },      INT_MIN, INT_MAX, FLAGS, .unit = "model_type" },
    { "ssd",         "output shape [1, 1, N, 7]",              0,          AV_OPT_TYPE_CONST,  { .i64 = DDMT_SSD },      0, 0, FLAGS, .unit = "model_type" },
    { "yolo",        "output shape [1, N*Cx*Cy*DetectionBox]", 0,          AV_OPT_TYPE_CONST,  { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolov3",      "outputs shape [1, N*D, Cx, Cy]",         0,          AV_OPT_TYPE_CONST,  { .i64 = DDMT_YOLOV3 },   0, 0, FLAGS, .unit = "model_type" },
    { "yolov4",      "outputs shape [1, N*D, Cx, Cy]",         0,          AV_OPT_TYPE_CONST,  { .i64 = DDMT_YOLOV4 },   0, 0, FLAGS, .unit = "model_type" },
    { "cell_w",      "cell width",               OFFSET2(cell_w),          AV_OPT_TYPE_INT,    { .i64 = 0 },             0, INTMAX_MAX, FLAGS },
    { "cell_h",      "cell height",              OFFSET2(cell_h),          AV_OPT_TYPE_INT,    { .i64 = 0 },             0, INTMAX_MAX, FLAGS },
    { "nb_classes",  "the number of classes",    OFFSET2(nb_classes),      AV_OPT_TYPE_INT,    { .i64 = 0 },             0, INTMAX_MAX, FLAGS },
    { "anchors",     "anchors, split by '&'",    OFFSET2(anchors_str),     AV_OPT_TYPE_STRING, { .str = NULL },          0, 0, FLAGS },
    { NULL }
};
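
/*
 * Example option string for a YOLO-family model (values are hypothetical;
 * anchors are width/height pairs joined with '&' because ':' already
 * separates filter options):
 *
 *   dnn_detect=dnn_backend=openvino:model=yolo.xml:model_type=yolov4:nb_classes=80:anchors=81&82&135&169&344&319:confidence=0.4
 */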

AVFILTER_DEFINE_CLASS(dnn_detect);

static inline float sigmoid(float x) {
    return 1.f / (1.f + exp(-x));
}

static inline float linear(float x) {
    return x;
}

static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
{
    float max_prob = 0;
    int label_id = 0;
    for (int i = 0; i < nb_classes; i++) {
        if (label_data[i * cell_size] > max_prob) {
            max_prob = label_data[i * cell_size];
            label_id = i;
        }
    }
    return label_id;
}

static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
{
    char *saveptr = NULL, *token;
    float *anchors_buf;
    int nb_anchor = 0, i = 0;
    while (anchors_str[i] != '\0') {
        if (anchors_str[i] == '&')
            nb_anchor++;
        i++;
    }
    nb_anchor++;
    anchors_buf = av_mallocz(nb_anchor * sizeof(**anchors));
    if (!anchors_buf) {
        return 0;
    }
    for (int i = 0; i < nb_anchor; i++) {
        token = av_strtok(anchors_str, "&", &saveptr);
        if (!token) {
            av_freep(&anchors_buf);
            return 0;
        }
        anchors_buf[i] = strtof(token, NULL);
        anchors_str = NULL;
    }
    *anchors = anchors_buf;
    return nb_anchor;
}

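/*
 * For example, anchors_str "81&82&135&169" makes dnn_detect_parse_anchors()
 * return 4 and fill anchors with { 81.0f, 82.0f, 135.0f, 169.0f }.
 */
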
/* Calculate Intersection Over Union */
static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)
{
    float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x);
    float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y);
    float intersection_area =
        (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
    float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area;
    return intersection_area / union_area;
}

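/*
 * Worked example: two 4x4 boxes whose top-left corners are (0,0) and (2,2)
 * overlap in a 2x2 region, so IOU = 4 / (16 + 16 - 4) = 1/7 (about 0.14).
 */
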
static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index,
                                        AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    int detection_boxes, box_size;
    int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0;
    int nb_classes = ctx->nb_classes;
    float *output_data = output[output_index].data;
    float *anchors = ctx->anchors;
    AVDetectionBBox *bbox;
    float (*post_process_raw_data)(float x) = linear;
    int is_NHWC = 0;

    if (ctx->model_type == DDMT_YOLOV1V2) {
        cell_w = ctx->cell_w;
        cell_h = ctx->cell_h;
        scale_w = cell_w;
        scale_h = cell_h;
    } else {
        if (output[output_index].dims[2] != output[output_index].dims[3] &&
            output[output_index].dims[2] == output[output_index].dims[1]) {
            is_NHWC = 1;
            cell_w = output[output_index].dims[2];
            cell_h = output[output_index].dims[1];
        } else {
            cell_w = output[output_index].dims[3];
            cell_h = output[output_index].dims[2];
        }
        scale_w = ctx->scale_width;
        scale_h = ctx->scale_height;
    }
    box_size = nb_classes + 5;

    switch (ctx->model_type) {
    case DDMT_YOLOV1V2:
    case DDMT_YOLOV3:
        post_process_raw_data = linear;
        break;
    case DDMT_YOLOV4:
        post_process_raw_data = sigmoid;
        break;
    }

    if (!cell_h || !cell_w) {
        av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are not set\n");
        return AVERROR(EINVAL);
    }

    if (!nb_classes) {
        av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n");
        return AVERROR(EINVAL);
    }

    if (!anchors) {
        av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n");
        return AVERROR(EINVAL);
    }

    if (output[output_index].dims[1] * output[output_index].dims[2] *
        output[output_index].dims[3] % (box_size * cell_w * cell_h)) {
        av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");
        return AVERROR(EINVAL);
    }
    detection_boxes = output[output_index].dims[1] *
                      output[output_index].dims[2] *
                      output[output_index].dims[3] / box_size / cell_w / cell_h;

    anchors = anchors + (detection_boxes * output_index * 2);
    /**
     * find all candidate bboxes
     * yolo output can be reshaped to [B, N*D, Cx, Cy]
     * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...]
     **/
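    /*
     * In the NCHW branch below, the parameter with channel index c
     * (0 = x, 1 = y, 2 = w, 3 = h, 4 = box_score, 5+ = per-class scores)
     * of box 'box_id' at grid cell (cx, cy) is read from
     * output_data[box_id * box_size * cell_w * cell_h + c * cell_w * cell_h + cy * cell_w + cx].
     */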
    for (int box_id = 0; box_id < detection_boxes; box_id++) {
        for (int cx = 0; cx < cell_w; cx++)
            for (int cy = 0; cy < cell_h; cy++) {
                float x, y, w, h, conf;
                float *detection_boxes_data;
                int label_id;

                if (is_NHWC) {
                    detection_boxes_data = output_data +
                        ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;
                    conf = post_process_raw_data(detection_boxes_data[4]);
                } else {
                    detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
                    conf = post_process_raw_data(
                        detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
                }

                if (is_NHWC) {
                    x = post_process_raw_data(detection_boxes_data[0]);
                    y = post_process_raw_data(detection_boxes_data[1]);
                    w = detection_boxes_data[2];
                    h = detection_boxes_data[3];
                    label_id = dnn_detect_get_label_id(ctx->nb_classes, 1, detection_boxes_data + 5);
                    conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);
                } else {
                    x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);
                    y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);
                    w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
                    h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
                    label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
                                                       detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
                    conf = conf * post_process_raw_data(
                        detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
                }
                if (conf < conf_threshold) {
                    continue;
                }

                bbox = av_mallocz(sizeof(*bbox));
                if (!bbox)
                    return AVERROR(ENOMEM);

                bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w;
                bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h;
                bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2;
                bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2;
                bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
                if (ctx->labels && label_id < ctx->label_count) {
                    av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
                } else {
                    snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
                }

                if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) {
                    av_freep(&bbox);
                    return AVERROR(ENOMEM);
                }
                bbox = NULL;
            }
    }
    return 0;
}

static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    AVDetectionBBox *bbox;
    int nb_bboxes = 0;
    AVDetectionBBoxHeader *header;

    if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    /* remove overlapping bboxes */
    for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++) {
        av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);
        for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {
            AVDetectionBBox *overlap_bbox;
            av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);
            if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&
                av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&
                dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {
                bbox->classify_count = -1; // bad result
                nb_bboxes++;
                break;
            }
        }
    }
    nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;
    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }
    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    while (av_fifo_can_read(ctx->bboxes_fifo)) {
        AVDetectionBBox *candidate_bbox;
        av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);

        if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {
            bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
            memcpy(bbox, candidate_bbox, sizeof(*bbox));
            nb_bboxes--;
        }
        av_freep(&candidate_bbox);
    }
    return 0;
}

static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    int ret = 0;
    ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx);
    if (ret < 0)
        return ret;
    ret = dnn_detect_fill_side_data(frame, filter_ctx);
    if (ret < 0)
        return ret;
    return 0;
}

static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
                                       AVFilterContext *filter_ctx, int nb_outputs)
{
    int ret = 0;
    for (int i = 0; i < nb_outputs; i++) {
        ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx);
        if (ret < 0)
            return ret;
    }
    ret = dnn_detect_fill_side_data(frame, filter_ctx);
    if (ret < 0)
        return ret;
    return 0;
}

static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
                                    AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    int proposal_count = 0;
    int detect_size = 0;
    float *detections = NULL, *labels = NULL;
    int nb_bboxes = 0;
    AVDetectionBBoxHeader *header;
    AVDetectionBBox *bbox;
    int scale_w = ctx->scale_width;
    int scale_h = ctx->scale_height;

    if (nb_outputs == 1 && output->dims[3] == 7) {
        proposal_count = output->dims[2];
        detect_size = output->dims[3];
        detections = output->data;
    } else if (nb_outputs == 2 && output[0].dims[3] == 5) {
        proposal_count = output[0].dims[2];
        detect_size = output[0].dims[3];
        detections = output[0].data;
        labels = output[1].data;
    } else if (nb_outputs == 2 && output[1].dims[3] == 5) {
        proposal_count = output[1].dims[2];
        detect_size = output[1].dims[3];
        detections = output[1].data;
        labels = output[0].data;
    } else {
        av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
        return AVERROR(EINVAL);
    }

    if (proposal_count == 0)
        return 0;

    for (int i = 0; i < proposal_count; ++i) {
        float conf;
        if (nb_outputs == 1)
            conf = detections[i * detect_size + 2];
        else
            conf = detections[i * detect_size + 4];
        if (conf < conf_threshold) {
            continue;
        }
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    for (int i = 0; i < proposal_count; ++i) {
        int av_unused image_id = (int)detections[i * detect_size + 0];
        int label_id;
        float conf, x0, y0, x1, y1;

        if (nb_outputs == 1) {
            label_id = (int)detections[i * detect_size + 1];
            conf = detections[i * detect_size + 2];
            x0 = detections[i * detect_size + 3];
            y0 = detections[i * detect_size + 4];
            x1 = detections[i * detect_size + 5];
            y1 = detections[i * detect_size + 6];
        } else {
            label_id = (int)labels[i];
            x0 = detections[i * detect_size] / scale_w;
            y0 = detections[i * detect_size + 1] / scale_h;
            x1 = detections[i * detect_size + 2] / scale_w;
            y1 = detections[i * detect_size + 3] / scale_h;
            conf = detections[i * detect_size + 4];
        }

        if (conf < conf_threshold) {
            continue;
        }

        bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
        bbox->classify_count = 0;

        if (ctx->labels && label_id < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
        }

        nb_bboxes--;
        if (nb_bboxes == 0) {
            break;
        }
    }
    return 0;
}

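/*
 * Proposal layouts handled above: with a single output tensor each row is
 * 7 floats [image_id, label_id, confidence, x0, y0, x1, y1] with corners
 * normalized to [0, 1]; with two output tensors each row is 5 floats
 * [x0, y0, x1, y1, confidence] in model-input pixels (hence the division
 * by scale_w/scale_h), with label ids in the second tensor.
 */
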
static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs,
                                   AVFilterContext *filter_ctx)
{
    AVFrameSideData *sd;
    DnnDetectContext *ctx = filter_ctx->priv;
    int ret = 0;

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "already have bounding boxes in side data.\n");
        return -1;
    }

    switch (ctx->model_type) {
    case DDMT_SSD:
        ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);
        if (ret < 0)
            return ret;
        break;
    case DDMT_YOLOV1V2:
        ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);
        if (ret < 0)
            return ret;
        break;
    case DDMT_YOLOV3:
    case DDMT_YOLOV4:
        ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);
        if (ret < 0)
            return ret;
        break;
    }
    return 0;
}

static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    int proposal_count;
    float conf_threshold = ctx->confidence;
    float *conf, *position, *label_id, x0, y0, x1, y1;
    int nb_bboxes = 0;
    AVFrameSideData *sd;
    AVDetectionBBox *bbox;
    AVDetectionBBoxHeader *header;

    proposal_count = *(float *)(output[0].data);
    conf = output[1].data;
    position = output[3].data;
    label_id = output[2].data;

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
        return -1;
    }

    for (int i = 0; i < proposal_count; ++i) {
        if (conf[i] < conf_threshold)
            continue;
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    for (int i = 0; i < proposal_count; ++i) {
        y0 = position[i * 4];
        x0 = position[i * 4 + 1];
        y1 = position[i * 4 + 2];
        x1 = position[i * 4 + 3];

        bbox = av_get_detection_bbox(header, i);

        if (conf[i] < conf_threshold) {
            continue;
        }

        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
        bbox->classify_count = 0;

        if (ctx->labels && label_id[i] < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);
        }

        nb_bboxes--;
        if (nb_bboxes == 0) {
            break;
        }
    }
    return 0;
}

static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    DnnContext *dnn_ctx = &ctx->dnnctx;
    switch (dnn_ctx->backend_type) {
    case DNN_OV:
        return dnn_detect_post_proc_ov(frame, output, nb, filter_ctx);
    case DNN_TF:
        return dnn_detect_post_proc_tf(frame, output, filter_ctx);
    default:
        avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
        return AVERROR(EINVAL);
    }
}

static void free_detect_labels(DnnDetectContext *ctx)
{
    for (int i = 0; i < ctx->label_count; i++) {
        av_freep(&ctx->labels[i]);
    }
    ctx->label_count = 0;
    av_freep(&ctx->labels);
}

static int read_detect_label_file(AVFilterContext *context)
{
    int line_len;
    FILE *file;
    DnnDetectContext *ctx = context->priv;

    file = avpriv_fopen_utf8(ctx->labels_filename, "r");
    if (!file) {
        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
        return AVERROR(EINVAL);
    }

    while (!feof(file)) {
        char *label;
        char buf[256];
        if (!fgets(buf, 256, file)) {
            break;
        }

        line_len = strlen(buf);
        while (line_len) {
            int i = line_len - 1;
            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
                buf[i] = '\0';
                line_len--;
            } else {
                break;
            }
        }

        if (line_len == 0)  // empty line
            continue;

        if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
            fclose(file);
            return AVERROR(EINVAL);
        }

        label = av_strdup(buf);
        if (!label) {
            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
            fclose(file);
            return AVERROR(ENOMEM);
        }

        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
            fclose(file);
            av_freep(&label);
            return AVERROR(ENOMEM);
        }
    }

    fclose(file);
    return 0;
}

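/*
 * The labels file read above is plain text with one label per line;
 * trailing '\n', '\r' and ' ' are stripped and empty lines are skipped:
 *
 *   person
 *   bicycle
 *   car
 */
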
static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)
{
    switch (backend_type) {
    case DNN_TF:
        if (output_nb != 4) {
            av_log(ctx, AV_LOG_ERROR, "Only support tensorflow detect model with 4 outputs, "
                                      "but got %d instead\n", output_nb);
            return AVERROR(EINVAL);
        }
        return 0;
    case DNN_OV:
        return 0;
    default:
        avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");
        return AVERROR(EINVAL);
    }
    return 0;
}

static av_cold int dnn_detect_init(AVFilterContext *context)
{
    DnnDetectContext *ctx = context->priv;
    DnnContext *dnn_ctx = &ctx->dnnctx;
    int ret;

    ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
    if (ret < 0)
        return ret;
    ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);
    if (ret < 0)
        return ret;
    ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);
    if (!ctx->bboxes_fifo)
        return AVERROR(ENOMEM);
    ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc);

    if (ctx->labels_filename) {
        return read_detect_label_file(context);
    }
    if (ctx->anchors_str) {
        ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors);
        if (!ctx->anchors) {
            av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n");
            return AVERROR(EINVAL);
        }
        ctx->nb_anchor = ret;
    }
    return 0;
}

static const enum AVPixelFormat pix_fmts[] = {
    AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_NONE
};

static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
{
    DnnDetectContext *ctx = outlink->src->priv;
    int ret;
    DNNAsyncStatusType async_state;

    ret = ff_dnn_flush(&ctx->dnnctx);
    if (ret != 0) {
        return -1;
    }

    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            if (out_pts)
                *out_pts = in_frame->pts + pts;
        }
        av_usleep(5000);
    } while (async_state >= DAST_NOT_READY);

    return 0;
}

static int dnn_detect_activate(AVFilterContext *filter_ctx)
{
    AVFilterLink *inlink = filter_ctx->inputs[0];
    AVFilterLink *outlink = filter_ctx->outputs[0];
    DnnDetectContext *ctx = filter_ctx->priv;
    AVFrame *in = NULL;
    int64_t pts;
    int ret, status;
    int got_frame = 0;
    int async_state;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    do {
        // drain all input frames
        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            if (ff_dnn_execute_model(&ctx->dnnctx, in, NULL) != 0) {
                return AVERROR(EIO);
            }
        }
    } while (ret > 0);

    // drain all processed frames
    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            got_frame = 1;
        }
    } while (async_state == DAST_SUCCESS);

    // if frame got, schedule to next filter
    if (got_frame)
        return 0;

    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            int64_t out_pts = pts;
            ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
            ff_outlink_set_status(outlink, status, out_pts);
            return ret;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return 0;
}

static av_cold void dnn_detect_uninit(AVFilterContext *context)
{
    DnnDetectContext *ctx = context->priv;
    AVDetectionBBox *bbox;
    ff_dnn_uninit(&ctx->dnnctx);
    while (av_fifo_can_read(ctx->bboxes_fifo)) {
        av_fifo_read(ctx->bboxes_fifo, &bbox, 1);
        av_freep(&bbox);
    }
    av_fifo_freep2(&ctx->bboxes_fifo);
    av_freep(&ctx->anchors);
    free_detect_labels(ctx);
}

static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *context = inlink->dst;
    DnnDetectContext *ctx = context->priv;
    DNNData model_input;
    int ret, width_idx, height_idx;

    ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);
    if (ret != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
        return ret;
    }
    width_idx = dnn_get_width_idx_by_layout(model_input.layout);
    height_idx = dnn_get_height_idx_by_layout(model_input.layout);
    ctx->scale_width = model_input.dims[width_idx] == -1 ? inlink->w :
                       model_input.dims[width_idx];
    ctx->scale_height = model_input.dims[height_idx] == -1 ? inlink->h :
                        model_input.dims[height_idx];

    return 0;
}

static const AVFilterPad dnn_detect_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
    },
};

const AVFilter ff_vf_dnn_detect = {
    .name          = "dnn_detect",
    .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
    .priv_size     = sizeof(DnnDetectContext),
    .init          = dnn_detect_init,
    .uninit        = dnn_detect_uninit,
    FILTER_INPUTS(dnn_detect_inputs),
    FILTER_OUTPUTS(ff_video_default_filterpad),
    FILTER_PIXFMTS_ARRAY(pix_fmts),
    .priv_class    = &dnn_detect_class,
    .activate      = dnn_detect_activate,
};