FFmpeg
vf_dnn_detect.c
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * An object detection filter using deep learning networks.
 */
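
/*
 * Example usage (illustrative only; the model, tensor names and labels file
 * below are hypothetical placeholders, not shipped with FFmpeg):
 *
 *   ffmpeg -i in.mp4 -vf \
 *     dnn_detect=dnn_backend=openvino:model=ssd.xml:input=data:\
 *     output=detection_out:confidence=0.6:labels=coco.txt \
 *     out.mp4
 *
 * Detection results are attached to each frame as
 * AV_FRAME_DATA_DETECTION_BBOXES side data for downstream filters to read.
 */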

#include "libavutil/file_open.h"
#include "libavutil/opt.h"
#include "filters.h"
#include "dnn_filter_common.h"
#include "internal.h"
#include "video.h"
#include "libavutil/time.h"
#include "libavutil/avstring.h"
#include "libavutil/detection_bbox.h"
#include "libavutil/fifo.h"

typedef enum {
    DDMT_SSD,
    DDMT_YOLOV1V2,
    DDMT_YOLOV3,
    DDMT_YOLOV4,
} DNNDetectionModelType;

typedef struct DnnDetectContext {
    const AVClass *class;
    DnnContext dnnctx;
    float confidence;
    char *labels_filename;
    char **labels;
    int label_count;
    DNNDetectionModelType model_type;
    int cell_w;
    int cell_h;
    int nb_classes;
    AVFifo *bboxes_fifo;
    int scale_width;
    int scale_height;
    char *anchors_str;
    float *anchors;
    int nb_anchor;
} DnnDetectContext;

#define OFFSET(x) offsetof(DnnDetectContext, dnnctx.x)
#define OFFSET2(x) offsetof(DnnDetectContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
static const AVOption dnn_detect_options[] = {
    { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = DNN_OV }, INT_MIN, INT_MAX, FLAGS, .unit = "backend" },
#if (CONFIG_LIBTENSORFLOW == 1)
    { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_TF }, 0, 0, FLAGS, .unit = "backend" },
#endif
#if (CONFIG_LIBOPENVINO == 1)
    { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = DNN_OV }, 0, 0, FLAGS, .unit = "backend" },
#endif
    DNN_COMMON_OPTIONS
    { "confidence", "threshold of confidence", OFFSET2(confidence), AV_OPT_TYPE_FLOAT, { .dbl = 0.5 }, 0, 1, FLAGS },
    { "labels", "path to labels file", OFFSET2(labels_filename), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
    { "model_type", "DNN detection model type", OFFSET2(model_type), AV_OPT_TYPE_INT, { .i64 = DDMT_SSD }, INT_MIN, INT_MAX, FLAGS, .unit = "model_type" },
    { "ssd", "output shape [1, 1, N, 7]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_SSD }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolo", "output shape [1, N*Cx*Cy*DetectionBox]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV1V2 }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolov3", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV3 }, 0, 0, FLAGS, .unit = "model_type" },
    { "yolov4", "outputs shape [1, N*D, Cx, Cy]", 0, AV_OPT_TYPE_CONST, { .i64 = DDMT_YOLOV4 }, 0, 0, FLAGS, .unit = "model_type" },
    { "cell_w", "cell width", OFFSET2(cell_w), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
    { "cell_h", "cell height", OFFSET2(cell_h), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
    { "nb_classes", "the number of classes", OFFSET2(nb_classes), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INTMAX_MAX, FLAGS },
    { "anchors", "anchors, separated by '&'", OFFSET2(anchors_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
    { NULL }
};

AVFILTER_DEFINE_CLASS(dnn_detect);

static inline float sigmoid(float x) {
    return 1.f / (1.f + exp(-x));
}

static inline float linear(float x) {
    return x;
}

static int dnn_detect_get_label_id(int nb_classes, int cell_size, float *label_data)
{
    float max_prob = 0;
    int label_id = 0;
    for (int i = 0; i < nb_classes; i++) {
        if (label_data[i * cell_size] > max_prob) {
            max_prob = label_data[i * cell_size];
            label_id = i;
        }
    }
    return label_id;
}
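
/*
 * Example (hypothetical scores): with nb_classes = 3, cell_size = 1 and
 * label_data = {0.1f, 0.7f, 0.2f}, dnn_detect_get_label_id() returns 1.
 * For NCHW YOLO outputs cell_size is cell_w * cell_h, because consecutive
 * class scores of one grid cell sit a whole plane apart.
 */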

static int dnn_detect_parse_anchors(char *anchors_str, float **anchors)
{
    char *saveptr = NULL, *token;
    float *anchors_buf;
    int nb_anchor = 0, i = 0;
    while (anchors_str[i] != '\0') {
        if (anchors_str[i] == '&')
            nb_anchor++;
        i++;
    }
    nb_anchor++;
    anchors_buf = av_mallocz(nb_anchor * sizeof(**anchors));
    if (!anchors_buf) {
        return 0;
    }
    for (int i = 0; i < nb_anchor; i++) {
        token = av_strtok(anchors_str, "&", &saveptr);
        if (!token) {
            av_freep(&anchors_buf);
            return 0;
        }
        anchors_buf[i] = strtof(token, NULL);
        anchors_str = NULL;
    }
    *anchors = anchors_buf;
    return nb_anchor;
}
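
/*
 * Example (hypothetical anchor set): anchors_str = "81&82&135&169&344&319"
 * yields nb_anchor = 6 with anchors = {81, 82, 135, 169, 344, 319}; the
 * values are consumed as (width, height) pairs per detection box in
 * dnn_detect_parse_yolo_output().
 */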

/* Calculate Intersection Over Union */
static float dnn_detect_IOU(AVDetectionBBox *bbox1, AVDetectionBBox *bbox2)
{
    float overlapping_width = FFMIN(bbox1->x + bbox1->w, bbox2->x + bbox2->w) - FFMAX(bbox1->x, bbox2->x);
    float overlapping_height = FFMIN(bbox1->y + bbox1->h, bbox2->y + bbox2->h) - FFMAX(bbox1->y, bbox2->y);
    float intersection_area =
        (overlapping_width < 0 || overlapping_height < 0) ? 0 : overlapping_height * overlapping_width;
    float union_area = bbox1->w * bbox1->h + bbox2->w * bbox2->h - intersection_area;
    return intersection_area / union_area;
}
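
/*
 * Worked example (hypothetical boxes): bbox1 = {x=0, y=0, w=4, h=4} and
 * bbox2 = {x=2, y=2, w=4, h=4} intersect in a 2x2 region, so
 * IOU = 4 / (16 + 16 - 4) ~= 0.143.
 */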

static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int output_index,
                                        AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    int detection_boxes, box_size;
    int cell_w = 0, cell_h = 0, scale_w = 0, scale_h = 0;
    int nb_classes = ctx->nb_classes;
    float *output_data = output[output_index].data;
    float *anchors = ctx->anchors;
    AVDetectionBBox *bbox;
    float (*post_process_raw_data)(float x) = linear;
    int is_NHWC = 0;

    if (ctx->model_type == DDMT_YOLOV1V2) {
        cell_w = ctx->cell_w;
        cell_h = ctx->cell_h;
        scale_w = cell_w;
        scale_h = cell_h;
    } else {
        if (output[output_index].dims[2] != output[output_index].dims[3] &&
            output[output_index].dims[2] == output[output_index].dims[1]) {
            is_NHWC = 1;
            cell_w = output[output_index].dims[2];
            cell_h = output[output_index].dims[1];
        } else {
            cell_w = output[output_index].dims[3];
            cell_h = output[output_index].dims[2];
        }
        scale_w = ctx->scale_width;
        scale_h = ctx->scale_height;
    }
    box_size = nb_classes + 5;

    switch (ctx->model_type) {
    case DDMT_YOLOV1V2:
    case DDMT_YOLOV3:
        post_process_raw_data = linear;
        break;
    case DDMT_YOLOV4:
        post_process_raw_data = sigmoid;
        break;
    }

    if (!cell_h || !cell_w) {
        av_log(filter_ctx, AV_LOG_ERROR, "cell_w and cell_h are not set\n");
        return AVERROR(EINVAL);
    }

    if (!nb_classes) {
        av_log(filter_ctx, AV_LOG_ERROR, "nb_classes is not set\n");
        return AVERROR(EINVAL);
    }

    if (!anchors) {
        av_log(filter_ctx, AV_LOG_ERROR, "anchors is not set\n");
        return AVERROR(EINVAL);
    }

    if (output[output_index].dims[1] * output[output_index].dims[2] *
        output[output_index].dims[3] % (box_size * cell_w * cell_h)) {
        av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n");
        return AVERROR(EINVAL);
    }
    detection_boxes = output[output_index].dims[1] *
                      output[output_index].dims[2] *
                      output[output_index].dims[3] / box_size / cell_w / cell_h;

    anchors = anchors + (detection_boxes * output_index * 2);
    /**
     * find all candidate bboxes
     * yolo output can be reshaped to [B, N*D, Cx, Cy]
     * Detection box 'D' has format [`x`, `y`, `h`, `w`, `box_score`, `class_no_1`, ...]
     **/
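    /*
     * Illustrative indexing for the NCHW branch below (hypothetical 13x13
     * grid with 80 classes, so box_size = 85): for box 0 at cell (cx, cy),
     * x is output_data[cy * 13 + cx], y sits one 169-element plane later,
     * and the box score four planes in; each field of a detection box
     * occupies a full cell_w * cell_h plane.
     */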
    for (int box_id = 0; box_id < detection_boxes; box_id++) {
        for (int cx = 0; cx < cell_w; cx++)
            for (int cy = 0; cy < cell_h; cy++) {
                float x, y, w, h, conf;
                float *detection_boxes_data;
                int label_id;

                if (is_NHWC) {
                    detection_boxes_data = output_data +
                        ((cy * cell_w + cx) * detection_boxes + box_id) * box_size;
                    conf = post_process_raw_data(detection_boxes_data[4]);
                } else {
                    detection_boxes_data = output_data + box_id * box_size * cell_w * cell_h;
                    conf = post_process_raw_data(
                               detection_boxes_data[cy * cell_w + cx + 4 * cell_w * cell_h]);
                }

                if (is_NHWC) {
                    x = post_process_raw_data(detection_boxes_data[0]);
                    y = post_process_raw_data(detection_boxes_data[1]);
                    w = detection_boxes_data[2];
                    h = detection_boxes_data[3];
                    label_id = dnn_detect_get_label_id(ctx->nb_classes, 1, detection_boxes_data + 5);
                    conf = conf * post_process_raw_data(detection_boxes_data[label_id + 5]);
                } else {
                    x = post_process_raw_data(detection_boxes_data[cy * cell_w + cx]);
                    y = post_process_raw_data(detection_boxes_data[cy * cell_w + cx + cell_w * cell_h]);
                    w = detection_boxes_data[cy * cell_w + cx + 2 * cell_w * cell_h];
                    h = detection_boxes_data[cy * cell_w + cx + 3 * cell_w * cell_h];
                    label_id = dnn_detect_get_label_id(ctx->nb_classes, cell_w * cell_h,
                                                       detection_boxes_data + cy * cell_w + cx + 5 * cell_w * cell_h);
                    conf = conf * post_process_raw_data(
                               detection_boxes_data[cy * cell_w + cx + (label_id + 5) * cell_w * cell_h]);
                }
                if (conf < conf_threshold) {
                    continue;
                }

                bbox = av_mallocz(sizeof(*bbox));
                if (!bbox)
                    return AVERROR(ENOMEM);

                bbox->w = exp(w) * anchors[box_id * 2] * frame->width / scale_w;
                bbox->h = exp(h) * anchors[box_id * 2 + 1] * frame->height / scale_h;
                bbox->x = (cx + x) / cell_w * frame->width - bbox->w / 2;
                bbox->y = (cy + y) / cell_h * frame->height - bbox->h / 2;
                bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
                if (ctx->labels && label_id < ctx->label_count) {
                    av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
                } else {
                    snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
                }

                if (av_fifo_write(ctx->bboxes_fifo, &bbox, 1) < 0) {
                    av_freep(&bbox);
                    return AVERROR(ENOMEM);
                }
                bbox = NULL;
            }
    }
    return 0;
}

static int dnn_detect_fill_side_data(AVFrame *frame, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    AVDetectionBBox *bbox;
    int nb_bboxes = 0;
    AVDetectionBBoxHeader *header;
    if (av_fifo_can_read(ctx->bboxes_fifo) == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    /* suppress overlapping bboxes: for any same-label pair whose IOU reaches
     * the threshold, mark the lower-confidence one as a bad result */
    for (int i = 0; i < av_fifo_can_read(ctx->bboxes_fifo); i++) {
        av_fifo_peek(ctx->bboxes_fifo, &bbox, 1, i);
        for (int j = 0; j < av_fifo_can_read(ctx->bboxes_fifo); j++) {
            AVDetectionBBox *overlap_bbox;
            av_fifo_peek(ctx->bboxes_fifo, &overlap_bbox, 1, j);
            if (!strcmp(bbox->detect_label, overlap_bbox->detect_label) &&
                av_cmp_q(bbox->detect_confidence, overlap_bbox->detect_confidence) < 0 &&
                dnn_detect_IOU(bbox, overlap_bbox) >= conf_threshold) {
                bbox->classify_count = -1; // bad result
                nb_bboxes++;
                break;
            }
        }
    }
    nb_bboxes = av_fifo_can_read(ctx->bboxes_fifo) - nb_bboxes;
    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }
    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    while (av_fifo_can_read(ctx->bboxes_fifo)) {
        AVDetectionBBox *candidate_bbox;
        av_fifo_read(ctx->bboxes_fifo, &candidate_bbox, 1);

        if (nb_bboxes > 0 && candidate_bbox->classify_count != -1) {
            bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
            memcpy(bbox, candidate_bbox, sizeof(*bbox));
            nb_bboxes--;
        }
        av_freep(&candidate_bbox);
    }
    return 0;
}

static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    int ret = 0;
    ret = dnn_detect_parse_yolo_output(frame, output, 0, filter_ctx);
    if (ret < 0)
        return ret;
    ret = dnn_detect_fill_side_data(frame, filter_ctx);
    if (ret < 0)
        return ret;
    return 0;
}

static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
                                       AVFilterContext *filter_ctx, int nb_outputs)
{
    int ret = 0;
    for (int i = 0; i < nb_outputs; i++) {
        ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx);
        if (ret < 0)
            return ret;
    }
    ret = dnn_detect_fill_side_data(frame, filter_ctx);
    if (ret < 0)
        return ret;
    return 0;
}

static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
                                    AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    int proposal_count = 0;
    int detect_size = 0;
    float *detections = NULL, *labels = NULL;
    int nb_bboxes = 0;
    AVDetectionBBoxHeader *header;
    AVDetectionBBox *bbox;
    int scale_w = ctx->scale_width;
    int scale_h = ctx->scale_height;

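    /*
     * The shape probing below assumes the common OpenVINO SSD output
     * conventions: a single output carries rows of
     * [image_id, label, conf, x_min, y_min, x_max, y_max], while a pair of
     * outputs splits boxes [x0, y0, x1, y1, conf] from a separate label
     * array.
     */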
    if (nb_outputs == 1 && output->dims[3] == 7) {
        proposal_count = output->dims[2];
        detect_size = output->dims[3];
        detections = output->data;
    } else if (nb_outputs == 2 && output[0].dims[3] == 5) {
        proposal_count = output[0].dims[2];
        detect_size = output[0].dims[3];
        detections = output[0].data;
        labels = output[1].data;
    } else if (nb_outputs == 2 && output[1].dims[3] == 5) {
        proposal_count = output[1].dims[2];
        detect_size = output[1].dims[3];
        detections = output[1].data;
        labels = output[0].data;
    } else {
        av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
        return AVERROR(EINVAL);
    }

    if (proposal_count == 0)
        return 0;

    for (int i = 0; i < proposal_count; ++i) {
        float conf;
        if (nb_outputs == 1)
            conf = detections[i * detect_size + 2];
        else
            conf = detections[i * detect_size + 4];
        if (conf < conf_threshold) {
            continue;
        }
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    for (int i = 0; i < proposal_count; ++i) {
        int av_unused image_id = (int)detections[i * detect_size + 0];
        int label_id;
        float conf, x0, y0, x1, y1;

        if (nb_outputs == 1) {
            label_id = (int)detections[i * detect_size + 1];
            conf = detections[i * detect_size + 2];
            x0 = detections[i * detect_size + 3];
            y0 = detections[i * detect_size + 4];
            x1 = detections[i * detect_size + 5];
            y1 = detections[i * detect_size + 6];
        } else {
            label_id = (int)labels[i];
            x0 = detections[i * detect_size] / scale_w;
            y0 = detections[i * detect_size + 1] / scale_h;
            x1 = detections[i * detect_size + 2] / scale_w;
            y1 = detections[i * detect_size + 3] / scale_h;
            conf = detections[i * detect_size + 4];
        }

        if (conf < conf_threshold) {
            continue;
        }

        bbox = av_get_detection_bbox(header, header->nb_bboxes - nb_bboxes);
        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf * 10000), 10000);
        bbox->classify_count = 0;

        if (ctx->labels && label_id < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[label_id], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", label_id);
        }

        nb_bboxes--;
        if (nb_bboxes == 0) {
            break;
        }
    }
    return 0;
}

static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outputs,
                                   AVFilterContext *filter_ctx)
{
    AVFrameSideData *sd;
    DnnDetectContext *ctx = filter_ctx->priv;
    int ret = 0;

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "already have bounding boxes in side data.\n");
        return -1;
    }

    switch (ctx->model_type) {
    case DDMT_SSD:
        ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);
        if (ret < 0)
            return ret;
        break;
    case DDMT_YOLOV1V2:
        ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);
        if (ret < 0)
            return ret;
        break;
    case DDMT_YOLOV3:
    case DDMT_YOLOV4:
        ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);
        if (ret < 0)
            return ret;
        break;
    }
    return 0;
}

static int dnn_detect_post_proc_tf(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    int proposal_count;
    float conf_threshold = ctx->confidence;
    float *conf, *position, *label_id, x0, y0, x1, y1;
    int nb_bboxes = 0;
    AVFrameSideData *sd;
    AVDetectionBBox *bbox;
    AVDetectionBBoxHeader *header;

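    /*
     * The four outputs are read following the TensorFlow Object Detection
     * API export layout (an assumption matching the indices used below):
     * output[0] holds the detection count, output[1] the per-box scores,
     * output[2] the class ids, and output[3] the boxes as normalized
     * [y_min, x_min, y_max, x_max].
     */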
    proposal_count = *(float *)(output[0].data);
    conf = output[1].data;
    position = output[3].data;
    label_id = output[2].data;

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "already have dnn bounding boxes in side data.\n");
        return -1;
    }

    for (int i = 0; i < proposal_count; ++i) {
        if (conf[i] < conf_threshold)
            continue;
        nb_bboxes++;
    }

    if (nb_bboxes == 0) {
        av_log(filter_ctx, AV_LOG_VERBOSE, "nothing detected in this frame.\n");
        return 0;
    }

    header = av_detection_bbox_create_side_data(frame, nb_bboxes);
    if (!header) {
        av_log(filter_ctx, AV_LOG_ERROR, "failed to create side data with %d bounding boxes\n", nb_bboxes);
        return -1;
    }

    av_strlcpy(header->source, ctx->dnnctx.model_filename, sizeof(header->source));

    for (int i = 0; i < proposal_count; ++i) {
        y0 = position[i * 4];
        x0 = position[i * 4 + 1];
        y1 = position[i * 4 + 2];
        x1 = position[i * 4 + 3];

        bbox = av_get_detection_bbox(header, i);

        if (conf[i] < conf_threshold) {
            continue;
        }

        bbox->x = (int)(x0 * frame->width);
        bbox->w = (int)(x1 * frame->width) - bbox->x;
        bbox->y = (int)(y0 * frame->height);
        bbox->h = (int)(y1 * frame->height) - bbox->y;

        bbox->detect_confidence = av_make_q((int)(conf[i] * 10000), 10000);
        bbox->classify_count = 0;

        if (ctx->labels && label_id[i] < ctx->label_count) {
            av_strlcpy(bbox->detect_label, ctx->labels[(int)label_id[i]], sizeof(bbox->detect_label));
        } else {
            snprintf(bbox->detect_label, sizeof(bbox->detect_label), "%d", (int)label_id[i]);
        }

        nb_bboxes--;
        if (nb_bboxes == 0) {
            break;
        }
    }
    return 0;
}

static int dnn_detect_post_proc(AVFrame *frame, DNNData *output, uint32_t nb, AVFilterContext *filter_ctx)
{
    DnnDetectContext *ctx = filter_ctx->priv;
    DnnContext *dnn_ctx = &ctx->dnnctx;
    switch (dnn_ctx->backend_type) {
    case DNN_OV:
        return dnn_detect_post_proc_ov(frame, output, nb, filter_ctx);
    case DNN_TF:
        return dnn_detect_post_proc_tf(frame, output, filter_ctx);
    default:
        avpriv_report_missing_feature(filter_ctx, "Current dnn backend does not support detect filter\n");
        return AVERROR(EINVAL);
    }
}

static void free_detect_labels(DnnDetectContext *ctx)
{
    for (int i = 0; i < ctx->label_count; i++) {
        av_freep(&ctx->labels[i]);
    }
    ctx->label_count = 0;
    av_freep(&ctx->labels);
}

static int read_detect_label_file(AVFilterContext *context)
{
    int line_len;
    FILE *file;
    DnnDetectContext *ctx = context->priv;

    file = avpriv_fopen_utf8(ctx->labels_filename, "r");
    if (!file) {
        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
        return AVERROR(EINVAL);
    }

    while (!feof(file)) {
        char *label;
        char buf[256];
        if (!fgets(buf, 256, file)) {
            break;
        }

        line_len = strlen(buf);
        while (line_len) {
            int i = line_len - 1;
            if (buf[i] == '\n' || buf[i] == '\r' || buf[i] == ' ') {
                buf[i] = '\0';
                line_len--;
            } else {
                break;
            }
        }

        if (line_len == 0) // empty line
            continue;

        if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
            fclose(file);
            return AVERROR(EINVAL);
        }

        label = av_strdup(buf);
        if (!label) {
            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
            fclose(file);
            return AVERROR(ENOMEM);
        }

        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
            fclose(file);
            av_freep(&label);
            return AVERROR(ENOMEM);
        }
    }

    fclose(file);
    return 0;
}
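
/*
 * Example labels file (hypothetical contents): one label per line, trailing
 * newlines/spaces trimmed, empty lines skipped, each label shorter than
 * AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE:
 *
 *   person
 *   bicycle
 *   car
 */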

static int check_output_nb(DnnDetectContext *ctx, DNNBackendType backend_type, int output_nb)
{
    switch (backend_type) {
    case DNN_TF:
        if (output_nb != 4) {
            av_log(ctx, AV_LOG_ERROR, "Only support tensorflow detect model with 4 outputs, "
                                      "but got %d instead\n", output_nb);
            return AVERROR(EINVAL);
        }
        return 0;
    case DNN_OV:
        return 0;
    default:
        avpriv_report_missing_feature(ctx, "Dnn detect filter does not support current backend\n");
        return AVERROR(EINVAL);
    }
    return 0;
}

static av_cold int dnn_detect_init(AVFilterContext *context)
{
    DnnDetectContext *ctx = context->priv;
    DnnContext *dnn_ctx = &ctx->dnnctx;
    int ret;

    ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_DETECT, context);
    if (ret < 0)
        return ret;
    ret = check_output_nb(ctx, dnn_ctx->backend_type, dnn_ctx->nb_outputs);
    if (ret < 0)
        return ret;
    ctx->bboxes_fifo = av_fifo_alloc2(1, sizeof(AVDetectionBBox *), AV_FIFO_FLAG_AUTO_GROW);
    if (!ctx->bboxes_fifo)
        return AVERROR(ENOMEM);
    ff_dnn_set_detect_post_proc(&ctx->dnnctx, dnn_detect_post_proc);

    if (ctx->labels_filename) {
        return read_detect_label_file(context);
    }
    if (ctx->anchors_str) {
        ret = dnn_detect_parse_anchors(ctx->anchors_str, &ctx->anchors);
        if (!ctx->anchors) {
            av_log(context, AV_LOG_ERROR, "failed to parse anchors_str\n");
            return AVERROR(EINVAL);
        }
        ctx->nb_anchor = ret;
    }
    return 0;
}

static const enum AVPixelFormat pix_fmts[] = {
    AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, AV_PIX_FMT_GRAY8,
    AV_PIX_FMT_GRAYF32,
    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_NONE
};

static int dnn_detect_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
{
    DnnDetectContext *ctx = outlink->src->priv;
    int ret;
    DNNAsyncStatusType async_state;

    ret = ff_dnn_flush(&ctx->dnnctx);
    if (ret != 0) {
        return -1;
    }

    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            if (out_pts)
                *out_pts = in_frame->pts + pts;
        }
        av_usleep(5000);
    } while (async_state >= DAST_NOT_READY);

    return 0;
}

static int dnn_detect_activate(AVFilterContext *filter_ctx)
{
    AVFilterLink *inlink = filter_ctx->inputs[0];
    AVFilterLink *outlink = filter_ctx->outputs[0];
    DnnDetectContext *ctx = filter_ctx->priv;
    AVFrame *in = NULL;
    int64_t pts;
    int ret, status;
    int got_frame = 0;
    int async_state;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    do {
        // drain all input frames
        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            if (ff_dnn_execute_model(&ctx->dnnctx, in, NULL) != 0) {
                return AVERROR(EIO);
            }
        }
    } while (ret > 0);

    // drain all processed frames
    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            got_frame = 1;
        }
    } while (async_state == DAST_SUCCESS);

    // if frame got, schedule to next filter
    if (got_frame)
        return 0;

    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            int64_t out_pts = pts;
            ret = dnn_detect_flush_frame(outlink, pts, &out_pts);
            ff_outlink_set_status(outlink, status, out_pts);
            return ret;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return 0;
}

static av_cold void dnn_detect_uninit(AVFilterContext *context)
{
    DnnDetectContext *ctx = context->priv;
    AVDetectionBBox *bbox;
    ff_dnn_uninit(&ctx->dnnctx);
    while (av_fifo_can_read(ctx->bboxes_fifo)) {
        av_fifo_read(ctx->bboxes_fifo, &bbox, 1);
        av_freep(&bbox);
    }
    av_fifo_freep2(&ctx->bboxes_fifo);
    av_freep(&ctx->anchors);
    free_detect_labels(ctx);
}

static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *context = inlink->dst;
    DnnDetectContext *ctx = context->priv;
    DNNData model_input;
    int ret, width_idx, height_idx;

    ret = ff_dnn_get_input(&ctx->dnnctx, &model_input);
    if (ret != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
        return ret;
    }
    width_idx = dnn_get_width_idx_by_layout(model_input.layout);
    height_idx = dnn_get_height_idx_by_layout(model_input.layout);
    ctx->scale_width = model_input.dims[width_idx] == -1 ? inlink->w :
                       model_input.dims[width_idx];
    ctx->scale_height = model_input.dims[height_idx] == -1 ? inlink->h :
                        model_input.dims[height_idx];

    return 0;
}

static const AVFilterPad dnn_detect_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
    },
};

const AVFilter ff_vf_dnn_detect = {
    .name          = "dnn_detect",
    .description   = NULL_IF_CONFIG_SMALL("Apply DNN detect filter to the input."),
    .priv_size     = sizeof(DnnDetectContext),
    .init          = dnn_detect_init,
    .uninit        = dnn_detect_uninit,
    FILTER_INPUTS(dnn_detect_inputs),
    FILTER_OUTPUTS(ff_video_default_filterpad),
    FILTER_PIXFMTS_ARRAY(pix_fmts),
    .priv_class    = &dnn_detect_class,
    .activate      = dnn_detect_activate,
};